The purpose of this document is to put together cluster analysis of the BGT Resume data for nonveterans and veterans.

# Here we create the function to cluster sequences. We can input the sequence object to cluster, one of three sequence distance options (coming from the testing_cluster_combos top performers) and the number of clusters

cluster <- function(name, object, sequence_option = "option 1", cluster_number) {
transition_matrix <- seqtrate(object, weighted=FALSE, count=TRUE)
round(transition_matrix,2)
diag(transition_matrix) = 0
round(transition_matrix,2)
rowsum <- apply(transition_matrix, 1, sum)
for(i in 1:nrow(transition_matrix)) {
  for (j in 1:ncol(transition_matrix)){
    transition_matrix[i,j] = transition_matrix[i,j]/rowsum[i]
  }
}
for(j in 1:ncol(transition_matrix)){
      transition_matrix[7,j] = transition_matrix[7,j] = 0
}
transition_matrix <- round(transition_matrix,2)
cost_matrix <- 2-transition_matrix
round(cost_matrix,2)
diag(cost_matrix) = 0
cost_matrix <- round(cost_matrix,2)

if (sequence_option == "option 1"){
# OPTION 1 : OUR OM SPELL, LOW EXPANSION COST AND LOW WEIGHT
sequencing <- seqdist(object, method = "OMspell", indel = "auto", sm = cost_matrix, with.missing = TRUE, otto = 0, expcost = 0)
}
if (sequence_option == "option 2"){
# OPTION 2 : OUR OM
sequencing <- seqdist(object, method = "OMstran", indel = "auto", sm = cost_matrix, with.missing = TRUE, otto = 0.5)
}
if (sequence_option == "option 3"){
# OPTION 3 : OUR OM, LOW WEIGHT ON ORIGIN
sequencing <- seqdist(object, method = "OMstran", indel = 1, sm = cost_matrix, with.missing = TRUE, otto = 0.01)
}
clusterpam <- wcKMedoids(sequencing, k = cluster_number)
#clusterpam$stats

#clusterpam <- clusterpam$clustering
#clusterpam <- factor(clusterpam)

cluster <- list(sequencing, clusterpam)
names(cluster) <- c("diss", "cluster")

assign(name, cluster, envir = .GlobalEnv)
}

# Here we create the function to visualize most of the covariates 
cluster_membership_visualization <- function(variable) {
  all_df <- all_df %>% select(-skill, -major) %>% distinct()
  overall <- all_df %>% filter(!is.na({{ variable }})) %>% mutate(total = n()) %>% group_by(cluster) %>% mutate(overall_cluster_count = n()) %>% group_by({{ variable }}) %>% mutate(overall_covariate_percent = round((n()/total * 100), 2)) %>% distinct({{ variable }}, overall_covariate_percent) %>% mutate(cluster_label = "overall", cluster_group = "overall", overall_cluster_percent = .1) %>% rename(cluster_covariate_percent = overall_covariate_percent)
all_df %>% filter(!is.na({{ variable }})) %>% mutate(total = n()) %>% group_by(cluster) %>% mutate(overall_cluster_count = n(), overall_cluster_percent = n()/total) %>% group_by({{ variable }}) %>% mutate(overall_covariate_percent = round((n()/total * 100), 2)) %>% group_by(cluster, {{ variable }}) %>% mutate(cluster_covariate_percent = round((n()/overall_cluster_count * 100), 2)) %>% distinct(cluster, cluster_label, cluster_group, {{ variable }}, cluster_covariate_percent, overall_cluster_percent) %>% rbind(overall) %>%
  ggplot(aes(cluster_covariate_percent, cluster_label, fill = factor({{ variable }}))) +
  geom_col(aes(width = overall_cluster_percent*10)) +
  facet_wrap(~ factor(cluster_group, levels = c("overall", "education", "promotion", "no change", "demotion", "unemployment")), ncol = 1, scales="free")
}

# Here we create the function to visualize the skill and major covariates
cluster_membership_visualization_skillmajor <- function(variable) {
  df <- all_df %>%
  filter(!is.na({{ variable }})) %>% distinct(id, cluster, cluster_long, cluster_group, cluster_label, {{variable}}) %>%
  group_by({{ variable }}) %>% mutate(variable_total = n()) %>% ungroup() %>% mutate(variable_percent = variable_total/n() * 100) %>%
  group_by(cluster) %>%
  mutate(cluster_total = n()) %>%
  group_by(cluster, {{ variable }}) %>%
  mutate(percent = n()/cluster_total * 100, difference = (percent - variable_percent)/variable_percent) %>%
  distinct({{ variable }}, cluster_long, cluster_group, cluster_label, percent, variable_percent, difference) %>%
  group_by({{ variable }}) %>% mutate(sum = sum(percent))

df$cluster <- as.factor(df$cluster)
df <- df %>% filter(!is.na({{ variable }}))
df <- as.data.frame(df)

ggplot(df, aes(cluster_label, {{ variable }}, fill= difference)) + 
  geom_tile()+
  scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0, limits = c(-1, 1), oob = squish, name = "Factor of difference \n of variable percent in \n cluster to variable \n percent overall", breaks = c(-1, 0, 1), labels = c("-2x or less", "0", "2x or more"))+
  theme_classic()+
  #labs(title = paste0("Cluster membership by ", as.character({{ variable }}))) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  facet_grid(~cluster_group, switch = "x", scales = "free_x", space = "free_x")
}
# Here we create the clusters for both veterans and nonveterans using a function from the master_functions script
# We aren't building in any years of experience for nonveterans here
# These sequence objects are what is used in the test of cluster quality metrics

# NO YEARS EXPERIENCE ------------------------------------------------------------------------
# Creating sequence objects fo 10, 20, and 30 years
seq.10 <- create_seq_all_fixed("seq.10", 10, left_unemp = FALSE, 0)
seq.20 <- create_seq_all_fixed("seq.20", 20, left_unemp = FALSE, 0)
seq.30 <- create_seq_all_fixed("seq.30", 30, left_unemp = FALSE, 0)

# The sequence option and the number of clusters are chosen from the cluster quality metric test

# Clustering sequnces
seq.10cluster_20 <- cluster("cluster_10", seq.10, sequence_option = "option 1", 20)
seq.20cluster_20 <- cluster("cluster_10", seq.20, sequence_option = "option 2", 20)
seq.30cluster_10 <- cluster("cluster_10", seq.30, sequence_option = "option 2", 10)

Final clustering: - Veterans and nonveterans and all - 10, 20, 30 years

HIGH PERFORMERS: 10 YEARS: OPT 3 19-20, OPT 2 5, OPT 1 19-20 20 YEARS: OPT 2 6, OPT 2 19-20 30 YEARS: OPT 2 5, OPT 2 19-20

# Here we test the quality of clusters to see what combination of sequence_option and number of clusters performs the best for four cluster quality metrics
# A more in depth description of this process is explained in 19_testing_cluster_combos

#seq.10 <- create_seq_all_fixed("seq.10", 10, left_unemp = FALSE)
#seq.20 <- create_seq_all_fixed("seq.20", 20, left_unemp = FALSE)
#seq.30 <- create_seq_all_fixed("seq.30", 30, left_unemp = FALSE)

## TEN YEARS --------------
# Creating transition and cost matrix for sequence object
transition_matrix <- seqtrate(seq.10, weighted=FALSE, count=TRUE)
round(transition_matrix,2)
##                            [-> 1] [-> 2] [-> 3] [-> 4] [-> 5] [-> 6]
## [1 ->]                        396     54     21     21      8     23
## [2 ->]                         37  16088    872   1001    187    421
## [3 ->]                         12    572  20513   1578    344    510
## [4 ->]                         11    669   1010  44689   1035   1141
## [5 ->]                          6    124    282    917  10606    327
## [6 ->]                         16    511    734   1747    616  25063
## [Civilian unemployment ->]     35    677    663   1219    373    528
##                            [-> Civilian unemployment]
## [1 ->]                                             35
## [2 ->]                                            798
## [3 ->]                                            763
## [4 ->]                                           1271
## [5 ->]                                            362
## [6 ->]                                           2085
## [Civilian unemployment ->]                       9707
diag(transition_matrix) = 0
round(transition_matrix,2)
##                            [-> 1] [-> 2] [-> 3] [-> 4] [-> 5] [-> 6]
## [1 ->]                          0     54     21     21      8     23
## [2 ->]                         37      0    872   1001    187    421
## [3 ->]                         12    572      0   1578    344    510
## [4 ->]                         11    669   1010      0   1035   1141
## [5 ->]                          6    124    282    917      0    327
## [6 ->]                         16    511    734   1747    616      0
## [Civilian unemployment ->]     35    677    663   1219    373    528
##                            [-> Civilian unemployment]
## [1 ->]                                             35
## [2 ->]                                            798
## [3 ->]                                            763
## [4 ->]                                           1271
## [5 ->]                                            362
## [6 ->]                                           2085
## [Civilian unemployment ->]                          0
rowsum <- apply(transition_matrix, 1, sum)
for(i in 1:nrow(transition_matrix)) {
  for (j in 1:ncol(transition_matrix)){
    transition_matrix[i,j] = transition_matrix[i,j]/rowsum[i]
  }
}
for(j in 1:ncol(transition_matrix)){
      transition_matrix[7,j] = transition_matrix[7,j] = 0
}
transition_matrix <- round(transition_matrix,2)
cost_matrix <- 2-transition_matrix
round(cost_matrix,2)
##                            [-> 1] [-> 2] [-> 3] [-> 4] [-> 5] [-> 6]
## [1 ->]                       2.00   1.67   1.87   1.87   1.95   1.86
## [2 ->]                       1.99   2.00   1.74   1.70   1.94   1.87
## [3 ->]                       2.00   1.85   2.00   1.58   1.91   1.87
## [4 ->]                       2.00   1.87   1.80   2.00   1.80   1.78
## [5 ->]                       2.00   1.94   1.86   1.55   2.00   1.84
## [6 ->]                       2.00   1.91   1.87   1.69   1.89   2.00
## [Civilian unemployment ->]   2.00   2.00   2.00   2.00   2.00   2.00
##                            [-> Civilian unemployment]
## [1 ->]                                           1.78
## [2 ->]                                           1.76
## [3 ->]                                           1.80
## [4 ->]                                           1.75
## [5 ->]                                           1.82
## [6 ->]                                           1.63
## [Civilian unemployment ->]                       2.00
diag(cost_matrix) = 0
cost_matrix <- round(cost_matrix,2)

# Creating distance matricies for the three distance options
# OPTION 1 : OUR OM SPELL, LOW EXPANSION COST AND LOW WEIGHT
sequencing.opt1 <- seqdist(seq.10, method = "OMspell", indel = "auto", sm = cost_matrix, with.missing = TRUE, otto = 0, expcost = 0)

# OPTION 2 : OUR OM
sequencing.opt2 <- seqdist(seq.10, method = "OMstran", indel = "auto", sm = cost_matrix, with.missing = TRUE, otto = 0.5)

# OPTION 3 : OUR OM, LOW WEIGHT ON ORIGIN
sequencing.opt3 <- seqdist(seq.10, method = "OMstran", indel = 1, sm = cost_matrix, with.missing = TRUE, otto = 0.01)

# Beginning the clustering, we are only using PAM clustering so we are varying the number of clusters and the distance option here
# CLUSTERING
numbers <- (2:20)
pamstats <- NULL
#rownames(pamstats) <- numbers
pamstats <- NULL
for (number in numbers) {
  clusterpam <- wcKMedoids(sequencing.opt1, k = number)
  assign(paste0("sequencing.pam.opt1.", number), clusterpam, envir = .GlobalEnv)
  pamstats[[number]] <- clusterpam$stats
}
pamstats.opt1 <- dplyr::bind_rows(pamstats)
pamstats <- NULL
for (number in numbers) {
  clusterpam <- wcKMedoids(sequencing.opt2, k = number)
  assign(paste0("sequencing.pam.opt2.", number), clusterpam, envir = .GlobalEnv)
  pamstats[[number]] <- clusterpam$stats
}
pamstats.opt2 <- dplyr::bind_rows(pamstats)
for (number in numbers) {
  clusterpam <- wcKMedoids(sequencing.opt3, k = number)
  assign(paste0("sequencing.pam.opt3.", number), clusterpam, envir = .GlobalEnv)
  pamstats[[number]] <- clusterpam$stats
}
pamstats.opt3 <- dplyr::bind_rows(pamstats)

# Here we are getting the 4 quality metrics for each solution to compare them
# QUALITY METRICS
rownames(pamstats.opt1) <- numbers
PAM.ASW.Opt1 <- as.matrix(pamstats.opt1$ASW)
colnames(PAM.ASW.Opt1) <- c("ASW opt1")
PAM.ASW.Opt2 <- as.matrix(pamstats.opt2$ASW)
colnames(PAM.ASW.Opt2) <- c("ASW opt2")
PAM.ASW.Opt3 <- as.matrix(pamstats.opt3$ASW)
colnames(PAM.ASW.Opt3) <- c("ASW opt3")
PAM.ASW <- bind_cols(PAM.ASW.Opt1, PAM.ASW.Opt2, PAM.ASW.Opt3)
PAM.ASW <- PAM.ASW %>% rownames_to_column() %>% melt() %>% mutate(group = "pam")
PAM.ASW$rowname <- as.numeric(PAM.ASW$rowname)

rownames(pamstats.opt1) <- numbers
PAM.HC.Opt1 <- as.matrix(pamstats.opt1$HC)
colnames(PAM.HC.Opt1) <- c("HC opt1")
PAM.HC.Opt2 <- as.matrix(pamstats.opt2$HC)
colnames(PAM.HC.Opt2) <- c("HC opt2")
PAM.HC.Opt3 <- as.matrix(pamstats.opt3$HC)
colnames(PAM.HC.Opt3) <- c("HC opt3")
PAM.HC <- bind_cols(PAM.HC.Opt1, PAM.HC.Opt2, PAM.HC.Opt3)
PAM.HC <- PAM.HC %>% rownames_to_column() %>% melt() %>% mutate(group = "pam")
PAM.HC$rowname <- as.numeric(PAM.HC$rowname)

rownames(pamstats.opt1) <- numbers
PAM.PBC.Opt1 <- as.matrix(pamstats.opt1$PBC)
colnames(PAM.PBC.Opt1) <- c("PBC opt1")
PAM.PBC.Opt2 <- as.matrix(pamstats.opt2$PBC)
colnames(PAM.PBC.Opt2) <- c("PBC opt2")
PAM.PBC.Opt3 <- as.matrix(pamstats.opt3$PBC)
colnames(PAM.PBC.Opt3) <- c("PBC opt3")
PAM.PBC <- bind_cols(PAM.PBC.Opt1, PAM.PBC.Opt2, PAM.PBC.Opt3)
PAM.PBC <- PAM.PBC %>% rownames_to_column() %>% melt() %>% mutate(group = "pam")
PAM.PBC$rowname <- as.numeric(PAM.PBC$rowname)

rownames(pamstats.opt1) <- numbers
PAM.R2.Opt1 <- as.matrix(pamstats.opt1$R2)
colnames(PAM.R2.Opt1) <- c("R2 opt1")
PAM.R2.Opt2 <- as.matrix(pamstats.opt2$R2)
colnames(PAM.R2.Opt2) <- c("R2 opt2")
PAM.R2.Opt3 <- as.matrix(pamstats.opt3$R2)
colnames(PAM.R2.Opt3) <- c("R2 opt3")
PAM.R2 <- bind_cols(PAM.R2.Opt1, PAM.R2.Opt2, PAM.R2.Opt3)
PAM.R2 <- PAM.R2 %>% rownames_to_column() %>% melt() %>% mutate(group = "pam")
PAM.R2$rowname <- as.numeric(PAM.R2$rowname)

# Here we visualize the results comparing each cluster solution
PAM.ASW %>% mutate(clusters = rowname + 1) %>%
ggplot(aes(x = variable, y = clusters, fill = value)) +
  #facet_grid(. ~ group, scales = "free") +
  geom_tile() +
  scale_fill_continuous(name = "ASW value", type = "viridis") +
#  labs(title = "10 Years - ASW (optimal 1)", caption = "Average silhouette width (ASW) is the coherence of assignments, high coherence indicates high between-group distances and strong
#within-group homogeneity \n Dissimiarlity options:\n Option 1: Optimal matching of spell sequences, low expansion cost, low weight, using our custom cost matrix \n Option 2: Optimal matching of transition sequences using our custom cost matrix \n Option 3: Optimal matching of transition sequences, low weight, using our custom cost matrix") +
  scale_x_discrete(name = "dissimilarity option", labels = c("1", "2", "3")) +
  ylab("number of clusters") +
  ylim(1,21) +
  theme_bw()

PAM.HC %>% mutate(clusters = rowname + 1) %>%
ggplot(aes(x = variable, y = clusters, fill = value)) +
  #facet_grid(. ~ group, scales = "free") +
  geom_tile() +
  scale_fill_continuous(name = "HC value", type = "viridis") +
 # labs(title = "10 Years - HC (optimal 0)", caption = "Average silhouette width (ASW) is the coherence of assignments, high coherence indicates high between-group distances and strong
#within-group homogeneity \n Dissimiarlity options:\n Option 1: Optimal matching of spell sequences, low expansion cost, low weight, using our custom cost matrix \n Option 2: Optimal matching of transition sequences using our custom cost matrix \n Option 3: Optimal matching of transition sequences, low weight, using our custom cost matrix") +
  scale_x_discrete(name = "dissimilarity option", labels = c("1", "2", "3")) +
  ylab("number of clusters") +
  ylim(1,21) +
  theme_bw()

PAM.PBC %>% mutate(clusters = rowname + 1) %>%
ggplot(aes(x = variable, y = clusters, fill = value)) +
  #facet_grid(. ~ group, scales = "free") +
  geom_tile() +
  scale_fill_continuous(name = "PBC value", type = "viridis") +
#  labs(title = "10 Years - PBC (optimal 1)", caption = "Average silhouette width (ASW) is the coherence of assignments, high coherence indicates high between-group distances and strong
#within-group homogeneity \n Dissimiarlity options:\n Option 1: Optimal matching of spell sequences, low expansion cost, low weight, using our custom cost matrix \n Option 2: Optimal matching of transition sequences using our custom cost matrix \n Option 3: Optimal matching of transition sequences, low weight, using our custom cost matrix") +
  scale_x_discrete(name = "dissimilarity option", labels = c("1", "2", "3")) +
  ylab("number of clusters") +
  ylim(1,21) +
  theme_bw()

PAM.R2 %>% mutate(clusters = rowname + 1) %>%
ggplot(aes(x = variable, y = clusters, fill = value)) +
  #facet_grid(. ~ group, scales = "free") +
  geom_tile() +
  scale_fill_continuous(name = "R^2 value", type = "viridis") +
#  labs(title = "10 Years - R2 (optimal 1)", caption = "Average silhouette width (ASW) is the coherence of assignments, high coherence indicates high between-group distances and strong
#within-group homogeneity \n Dissimiarlity options:\n Option 1: Optimal matching of spell sequences, low expansion cost, low weight, using our custom cost matrix \n Option 2: Optimal matching of transition sequences using our custom cost matrix \n Option 3: Optimal matching of transition sequences, low weight, using our custom cost matrix") +
  scale_x_discrete(name = "dissimilarity option", labels = c("1", "2", "3")) +
  ylab("number of clusters") +
  ylim(1,21) +
  theme_bw() +
  coord_cartesian(clip = 'off') +
      theme(plot.margin = unit(c(1,3,1,1), "lines")) 

## TWENTY YEARS ---------------------
transition_matrix <- seqtrate(seq.20, weighted=FALSE, count=TRUE)
round(transition_matrix,2)
##                            [-> 1] [-> 2] [-> 3] [-> 4] [-> 5] [-> 6]
## [1 ->]                        192     15      8     10      3      6
## [2 ->]                          7  12040    470    662    116    176
## [3 ->]                          6    364  17773   1119    223    311
## [4 ->]                          8    505    771  48192    989    835
## [5 ->]                          0     83    185    921  11503    251
## [6 ->]                          4    198    422   1318    452  15065
## [Civilian unemployment ->]     18    515    643   1367    423    455
##                            [-> Civilian unemployment]
## [1 ->]                                             15
## [2 ->]                                            467
## [3 ->]                                            592
## [4 ->]                                           1099
## [5 ->]                                            306
## [6 ->]                                           1561
## [Civilian unemployment ->]                      13547
diag(transition_matrix) = 0
round(transition_matrix,2)
##                            [-> 1] [-> 2] [-> 3] [-> 4] [-> 5] [-> 6]
## [1 ->]                          0     15      8     10      3      6
## [2 ->]                          7      0    470    662    116    176
## [3 ->]                          6    364      0   1119    223    311
## [4 ->]                          8    505    771      0    989    835
## [5 ->]                          0     83    185    921      0    251
## [6 ->]                          4    198    422   1318    452      0
## [Civilian unemployment ->]     18    515    643   1367    423    455
##                            [-> Civilian unemployment]
## [1 ->]                                             15
## [2 ->]                                            467
## [3 ->]                                            592
## [4 ->]                                           1099
## [5 ->]                                            306
## [6 ->]                                           1561
## [Civilian unemployment ->]                          0
rowsum <- apply(transition_matrix, 1, sum)
for(i in 1:nrow(transition_matrix)) {
  for (j in 1:ncol(transition_matrix)){
    transition_matrix[i,j] = transition_matrix[i,j]/rowsum[i]
  }
}
for(j in 1:ncol(transition_matrix)){
      transition_matrix[7,j] = transition_matrix[7,j] = 0
}
transition_matrix <- round(transition_matrix,2)
cost_matrix <- 2-transition_matrix
round(cost_matrix,2)
##                            [-> 1] [-> 2] [-> 3] [-> 4] [-> 5] [-> 6]
## [1 ->]                          2   1.74   1.86   1.82   1.95   1.89
## [2 ->]                          2   2.00   1.75   1.65   1.94   1.91
## [3 ->]                          2   1.86   2.00   1.57   1.91   1.88
## [4 ->]                          2   1.88   1.82   2.00   1.76   1.80
## [5 ->]                          2   1.95   1.89   1.47   2.00   1.86
## [6 ->]                          2   1.95   1.89   1.67   1.89   2.00
## [Civilian unemployment ->]      2   2.00   2.00   2.00   2.00   2.00
##                            [-> Civilian unemployment]
## [1 ->]                                           1.74
## [2 ->]                                           1.75
## [3 ->]                                           1.77
## [4 ->]                                           1.74
## [5 ->]                                           1.82
## [6 ->]                                           1.61
## [Civilian unemployment ->]                       2.00
diag(cost_matrix) = 0
cost_matrix <- round(cost_matrix,2)

# OPTION 1 : OUR OM SPELL, LOW EXPANSION COST AND LOW WEIGHT
sequencing.opt1 <- seqdist(seq.20, method = "OMspell", indel = "auto", sm = cost_matrix, with.missing = TRUE, otto = 0, expcost = 0)

# OPTION 2 : OUR OM
sequencing.opt2 <- seqdist(seq.20, method = "OMstran", indel = "auto", sm = cost_matrix, with.missing = TRUE, otto = 0.5)

# OPTION 3 : OUR OM, LOW WEIGHT ON ORIGIN
sequencing.opt3 <- seqdist(seq.20, method = "OMstran", indel = 1, sm = cost_matrix, with.missing = TRUE, otto = 0.01)

# CLUSTERING
numbers <- (2:20)
pamstats <- NULL
#rownames(pamstats) <- numbers
pamstats <- NULL
for (number in numbers) {
  clusterpam <- wcKMedoids(sequencing.opt1, k = number)
  assign(paste0("sequencing.pam.opt1.", number), clusterpam, envir = .GlobalEnv)
  pamstats[[number]] <- clusterpam$stats
}
pamstats.opt1 <- dplyr::bind_rows(pamstats)
pamstats <- NULL
for (number in numbers) {
  clusterpam <- wcKMedoids(sequencing.opt2, k = number)
  assign(paste0("sequencing.pam.opt2.", number), clusterpam, envir = .GlobalEnv)
  pamstats[[number]] <- clusterpam$stats
}
pamstats.opt2 <- dplyr::bind_rows(pamstats)
for (number in numbers) {
  clusterpam <- wcKMedoids(sequencing.opt3, k = number)
  assign(paste0("sequencing.pam.opt3.", number), clusterpam, envir = .GlobalEnv)
  pamstats[[number]] <- clusterpam$stats
}
pamstats.opt3 <- dplyr::bind_rows(pamstats)

# QUALITY METRICS
rownames(pamstats.opt1) <- numbers
PAM.ASW.Opt1 <- as.matrix(pamstats.opt1$ASW)
colnames(PAM.ASW.Opt1) <- c("ASW opt1")
PAM.ASW.Opt2 <- as.matrix(pamstats.opt2$ASW)
colnames(PAM.ASW.Opt2) <- c("ASW opt2")
PAM.ASW.Opt3 <- as.matrix(pamstats.opt3$ASW)
colnames(PAM.ASW.Opt3) <- c("ASW opt3")
PAM.ASW <- bind_cols(PAM.ASW.Opt1, PAM.ASW.Opt2, PAM.ASW.Opt3)
PAM.ASW <- PAM.ASW %>% rownames_to_column() %>% melt() %>% mutate(group = "pam")
PAM.ASW$rowname <- as.numeric(PAM.ASW$rowname)

rownames(pamstats.opt1) <- numbers
PAM.HC.Opt1 <- as.matrix(pamstats.opt1$HC)
colnames(PAM.HC.Opt1) <- c("HC opt1")
PAM.HC.Opt2 <- as.matrix(pamstats.opt2$HC)
colnames(PAM.HC.Opt2) <- c("HC opt2")
PAM.HC.Opt3 <- as.matrix(pamstats.opt3$HC)
colnames(PAM.HC.Opt3) <- c("HC opt3")
PAM.HC <- bind_cols(PAM.HC.Opt1, PAM.HC.Opt2, PAM.HC.Opt3)
PAM.HC <- PAM.HC %>% rownames_to_column() %>% melt() %>% mutate(group = "pam")
PAM.HC$rowname <- as.numeric(PAM.HC$rowname)

rownames(pamstats.opt1) <- numbers
PAM.PBC.Opt1 <- as.matrix(pamstats.opt1$PBC)
colnames(PAM.PBC.Opt1) <- c("PBC opt1")
PAM.PBC.Opt2 <- as.matrix(pamstats.opt2$PBC)
colnames(PAM.PBC.Opt2) <- c("PBC opt2")
PAM.PBC.Opt3 <- as.matrix(pamstats.opt3$PBC)
colnames(PAM.PBC.Opt3) <- c("PBC opt3")
PAM.PBC <- bind_cols(PAM.PBC.Opt1, PAM.PBC.Opt2, PAM.PBC.Opt3)
PAM.PBC <- PAM.PBC %>% rownames_to_column() %>% melt() %>% mutate(group = "pam")
PAM.PBC$rowname <- as.numeric(PAM.PBC$rowname)

rownames(pamstats.opt1) <- numbers
PAM.R2.Opt1 <- as.matrix(pamstats.opt1$R2)
colnames(PAM.R2.Opt1) <- c("R2 opt1")
PAM.R2.Opt2 <- as.matrix(pamstats.opt2$R2)
colnames(PAM.R2.Opt2) <- c("R2 opt2")
PAM.R2.Opt3 <- as.matrix(pamstats.opt3$R2)
colnames(PAM.R2.Opt3) <- c("R2 opt3")
PAM.R2 <- bind_cols(PAM.R2.Opt1, PAM.R2.Opt2, PAM.R2.Opt3)
PAM.R2 <- PAM.R2 %>% rownames_to_column() %>% melt() %>% mutate(group = "pam")
PAM.R2$rowname <- as.numeric(PAM.R2$rowname)

PAM.ASW %>% mutate(clusters = rowname + 1) %>%
ggplot(aes(x = variable, y = clusters, fill = value)) +
  #facet_grid(. ~ group, scales = "free") +
  geom_tile() +
  scale_fill_continuous(name = "ASW value", type = "viridis") +
#  labs(title = "20 Years - ASW (optimal 1)", caption = "Average silhouette width (ASW) is the coherence of assignments, high coherence indicates high between-group distances and strong
#within-group homogeneity \n Dissimiarlity options:\n Option 1: Optimal matching of spell sequences, low expansion cost, low weight, using our custom cost matrix \n Option 2: Optimal matching of transition sequences using our custom cost matrix \n Option 3: Optimal matching of transition sequences, low weight, using our custom cost matrix") +
  scale_x_discrete(name = "dissimilarity option", labels = c("1", "2", "3")) +
  ylab("number of clusters") +
  ylim(1,21) +
  theme_bw()

PAM.HC %>% mutate(clusters = rowname + 1) %>%
ggplot(aes(x = variable, y = clusters, fill = value)) +
  #facet_grid(. ~ group, scales = "free") +
  geom_tile() +
  scale_fill_continuous(name = "HC value", type = "viridis") +
#  labs(title = "20 Years - HC (optimal 0)", caption = "Average silhouette width (ASW) is the coherence of assignments, high coherence indicates high between-group distances and strong
#within-group homogeneity \n Dissimiarlity options:\n Option 1: Optimal matching of spell sequences, low expansion cost, low weight, using our custom cost matrix \n Option 2: Optimal matching of transition sequences using our custom cost matrix \n Option 3: Optimal matching of transition sequences, low weight, using our custom cost matrix") +
  scale_x_discrete(name = "dissimilarity option", labels = c("1", "2", "3")) +
  ylab("number of clusters") +
  ylim(1,21) +
  theme_bw()

PAM.PBC %>% mutate(clusters = rowname + 1) %>%
ggplot(aes(x = variable, y = clusters, fill = value)) +
  #facet_grid(. ~ group, scales = "free") +
  geom_tile() +
  scale_fill_continuous(name = "PBC value", type = "viridis") +
#  labs(title = "20 Years - PBC (optimal 1)", caption = "Average silhouette width (ASW) is the coherence of assignments, high coherence indicates high between-group distances and strong
#within-group homogeneity \n Dissimiarlity options:\n Option 1: Optimal matching of spell sequences, low expansion cost, low weight, using our custom cost matrix \n Option 2: Optimal matching of transition sequences using our custom cost matrix \n Option 3: Optimal matching of transition sequences, low weight, using our custom cost matrix") +
  scale_x_discrete(name = "dissimilarity option", labels = c("1", "2", "3")) +
  ylab("number of clusters") +
  ylim(1,21) +
  theme_bw()

PAM.R2 %>% mutate(clusters = rowname + 1) %>%
ggplot(aes(x = variable, y = clusters, fill = value)) +
  #facet_grid(. ~ group, scales = "free") +
  geom_tile() +
  scale_fill_continuous(name = "R^2 value", type = "viridis") +
#  labs(title = "20 Years - R2 (optimal 1)", caption = "Average silhouette width (ASW) is the coherence of assignments, high coherence indicates high between-group distances and strong
#within-group homogeneity \n Dissimiarlity options:\n Option 1: Optimal matching of spell sequences, low expansion cost, low weight, using our custom cost matrix \n Option 2: Optimal matching of transition sequences using our custom cost matrix \n Option 3: Optimal matching of transition sequences, low weight, using our custom cost matrix") +
  scale_x_discrete(name = "dissimilarity option", labels = c("1", "2", "3")) +
  ylab("number of clusters") +
  ylim(1,21) +
  theme_bw()

## THIRTY YEARS -----------------
transition_matrix <- seqtrate(seq.30, weighted=FALSE, count=TRUE)
round(transition_matrix,2)
##                            [-> 1] [-> 2] [-> 3] [-> 4] [-> 5] [-> 6]
## [1 ->]                         97      6      2      4      2      3
## [2 ->]                          4   5211    202    248     41     55
## [3 ->]                          4    157   7945    419    101    128
## [4 ->]                          1    213    298  22431    476    326
## [5 ->]                          1     38     74    420   6484    118
## [6 ->]                          2     47    176    520    186   6299
## [Civilian unemployment ->]      3    267    307    672    221    234
##                            [-> Civilian unemployment]
## [1 ->]                                              4
## [2 ->]                                            185
## [3 ->]                                            260
## [4 ->]                                            493
## [5 ->]                                            140
## [6 ->]                                            781
## [Civilian unemployment ->]                       9118
diag(transition_matrix) = 0
round(transition_matrix,2)
##                            [-> 1] [-> 2] [-> 3] [-> 4] [-> 5] [-> 6]
## [1 ->]                          0      6      2      4      2      3
## [2 ->]                          4      0    202    248     41     55
## [3 ->]                          4    157      0    419    101    128
## [4 ->]                          1    213    298      0    476    326
## [5 ->]                          1     38     74    420      0    118
## [6 ->]                          2     47    176    520    186      0
## [Civilian unemployment ->]      3    267    307    672    221    234
##                            [-> Civilian unemployment]
## [1 ->]                                              4
## [2 ->]                                            185
## [3 ->]                                            260
## [4 ->]                                            493
## [5 ->]                                            140
## [6 ->]                                            781
## [Civilian unemployment ->]                          0
rowsum <- apply(transition_matrix, 1, sum)
for(i in 1:nrow(transition_matrix)) {
  for (j in 1:ncol(transition_matrix)){
    transition_matrix[i,j] = transition_matrix[i,j]/rowsum[i]
  }
}
for(j in 1:ncol(transition_matrix)){
      transition_matrix[7,j] = transition_matrix[7,j] = 0
}
transition_matrix <- round(transition_matrix,2)
cost_matrix <- 2-transition_matrix
round(cost_matrix,2)
##                            [-> 1] [-> 2] [-> 3] [-> 4] [-> 5] [-> 6]
## [1 ->]                       2.00   1.71   1.90   1.81   1.90   1.86
## [2 ->]                       1.99   2.00   1.73   1.66   1.94   1.93
## [3 ->]                       2.00   1.85   2.00   1.61   1.91   1.88
## [4 ->]                       2.00   1.88   1.84   2.00   1.74   1.82
## [5 ->]                       2.00   1.95   1.91   1.47   2.00   1.85
## [6 ->]                       2.00   1.97   1.90   1.70   1.89   2.00
## [Civilian unemployment ->]   2.00   2.00   2.00   2.00   2.00   2.00
##                            [-> Civilian unemployment]
## [1 ->]                                           1.81
## [2 ->]                                           1.75
## [3 ->]                                           1.76
## [4 ->]                                           1.73
## [5 ->]                                           1.82
## [6 ->]                                           1.54
## [Civilian unemployment ->]                       2.00
diag(cost_matrix) = 0
cost_matrix <- round(cost_matrix,2)

# OPTION 1 : OUR OM SPELL, LOW EXPANSION COST AND LOW WEIGHT
sequencing.opt1 <- seqdist(seq.30, method = "OMspell", indel = "auto", sm = cost_matrix, with.missing = TRUE, otto = 0, expcost = 0)

# OPTION 2 : OUR OM
sequencing.opt2 <- seqdist(seq.30, method = "OMstran", indel = "auto", sm = cost_matrix, with.missing = TRUE, otto = 0.5)

# OPTION 3 : OUR OM, LOW WEIGHT ON ORIGIN
sequencing.opt3 <- seqdist(seq.30, method = "OMstran", indel = 1, sm = cost_matrix, with.missing = TRUE, otto = 0.01)

# CLUSTERING
numbers <- (2:20)
pamstats <- NULL
#rownames(pamstats) <- numbers
pamstats <- NULL
for (number in numbers) {
  clusterpam <- wcKMedoids(sequencing.opt1, k = number)
  assign(paste0("sequencing.pam.opt1.", number), clusterpam, envir = .GlobalEnv)
  pamstats[[number]] <- clusterpam$stats
}
pamstats.opt1 <- dplyr::bind_rows(pamstats)
pamstats <- NULL
for (number in numbers) {
  clusterpam <- wcKMedoids(sequencing.opt2, k = number)
  assign(paste0("sequencing.pam.opt2.", number), clusterpam, envir = .GlobalEnv)
  pamstats[[number]] <- clusterpam$stats
}
pamstats.opt2 <- dplyr::bind_rows(pamstats)
for (number in numbers) {
  clusterpam <- wcKMedoids(sequencing.opt3, k = number)
  assign(paste0("sequencing.pam.opt3.", number), clusterpam, envir = .GlobalEnv)
  pamstats[[number]] <- clusterpam$stats
}
pamstats.opt3 <- dplyr::bind_rows(pamstats)

# QUALITY METRICS
rownames(pamstats.opt1) <- numbers
PAM.ASW.Opt1 <- as.matrix(pamstats.opt1$ASW)
colnames(PAM.ASW.Opt1) <- c("ASW opt1")
PAM.ASW.Opt2 <- as.matrix(pamstats.opt2$ASW)
colnames(PAM.ASW.Opt2) <- c("ASW opt2")
PAM.ASW.Opt3 <- as.matrix(pamstats.opt3$ASW)
colnames(PAM.ASW.Opt3) <- c("ASW opt3")
PAM.ASW <- bind_cols(PAM.ASW.Opt1, PAM.ASW.Opt2, PAM.ASW.Opt3)
PAM.ASW <- PAM.ASW %>% rownames_to_column() %>% melt() %>% mutate(group = "pam")
PAM.ASW$rowname <- as.numeric(PAM.ASW$rowname)

rownames(pamstats.opt1) <- numbers
PAM.HC.Opt1 <- as.matrix(pamstats.opt1$HC)
colnames(PAM.HC.Opt1) <- c("HC opt1")
PAM.HC.Opt2 <- as.matrix(pamstats.opt2$HC)
colnames(PAM.HC.Opt2) <- c("HC opt2")
PAM.HC.Opt3 <- as.matrix(pamstats.opt3$HC)
colnames(PAM.HC.Opt3) <- c("HC opt3")
PAM.HC <- bind_cols(PAM.HC.Opt1, PAM.HC.Opt2, PAM.HC.Opt3)
PAM.HC <- PAM.HC %>% rownames_to_column() %>% melt() %>% mutate(group = "pam")
PAM.HC$rowname <- as.numeric(PAM.HC$rowname)

rownames(pamstats.opt1) <- numbers
PAM.PBC.Opt1 <- as.matrix(pamstats.opt1$PBC)
colnames(PAM.PBC.Opt1) <- c("PBC opt1")
PAM.PBC.Opt2 <- as.matrix(pamstats.opt2$PBC)
colnames(PAM.PBC.Opt2) <- c("PBC opt2")
PAM.PBC.Opt3 <- as.matrix(pamstats.opt3$PBC)
colnames(PAM.PBC.Opt3) <- c("PBC opt3")
PAM.PBC <- bind_cols(PAM.PBC.Opt1, PAM.PBC.Opt2, PAM.PBC.Opt3)
PAM.PBC <- PAM.PBC %>% rownames_to_column() %>% melt() %>% mutate(group = "pam")
PAM.PBC$rowname <- as.numeric(PAM.PBC$rowname)

rownames(pamstats.opt1) <- numbers
PAM.R2.Opt1 <- as.matrix(pamstats.opt1$R2)
colnames(PAM.R2.Opt1) <- c("R2 opt1")
PAM.R2.Opt2 <- as.matrix(pamstats.opt2$R2)
colnames(PAM.R2.Opt2) <- c("R2 opt2")
PAM.R2.Opt3 <- as.matrix(pamstats.opt3$R2)
colnames(PAM.R2.Opt3) <- c("R2 opt3")
PAM.R2 <- bind_cols(PAM.R2.Opt1, PAM.R2.Opt2, PAM.R2.Opt3)
PAM.R2 <- PAM.R2 %>% rownames_to_column() %>% melt() %>% mutate(group = "pam")
PAM.R2$rowname <- as.numeric(PAM.R2$rowname)

PAM.ASW %>% mutate(clusters = rowname + 1) %>%
ggplot(aes(x = variable, y = clusters, fill = value)) +
  #facet_grid(. ~ group, scales = "free") +
  geom_tile() +
  scale_fill_continuous(name = "ASW value", type = "viridis") +
#  labs(title = "30 Years - ASW (optimal 1)", caption = "Average silhouette width (ASW) is the coherence of assignments, high coherence indicates high between-group distances and strong
#within-group homogeneity \n Dissimiarlity options:\n Option 1: Optimal matching of spell sequences, low expansion cost, low weight, using our custom cost matrix \n Option 2: Optimal matching of transition sequences using our custom cost matrix \n Option 3: Optimal matching of transition sequences, low weight, using our custom cost matrix") +
  scale_x_discrete(name = "dissimilarity option", labels = c("1", "2", "3")) +
  ylab("number of clusters") +
  ylim(1,21) +
  theme_bw()

PAM.HC %>% mutate(clusters = rowname + 1) %>%
ggplot(aes(x = variable, y = clusters, fill = value)) +
  #facet_grid(. ~ group, scales = "free") +
  geom_tile() +
  scale_fill_continuous(name = "HC value", type = "viridis") +
#  labs(title = "30 Years - HC (optimal 0)", caption = "Average silhouette width (ASW) is the coherence of assignments, high coherence indicates high between-group distances and strong
#within-group homogeneity \n Dissimiarlity options:\n Option 1: Optimal matching of spell sequences, low expansion cost, low weight, using our custom cost matrix \n Option 2: Optimal matching of transition sequences using our custom cost matrix \n Option 3: Optimal matching of transition sequences, low weight, using our custom cost matrix") +
  scale_x_discrete(name = "dissimilarity option", labels = c("1", "2", "3")) +
  ylab("number of clusters") +
  ylim(1,21) +
  theme_bw()

PAM.PBC %>% mutate(clusters = rowname + 1) %>%
ggplot(aes(x = variable, y = clusters, fill = value)) +
  #facet_grid(. ~ group, scales = "free") +
  geom_tile() +
  scale_fill_continuous(name = "PBC value", type = "viridis") +
#  labs(title = "30 Years - PBC (optimal 1)", caption = "Average silhouette width (ASW) is the coherence of assignments, high coherence indicates high between-group distances and strong
#within-group homogeneity \n Dissimiarlity options:\n Option 1: Optimal matching of spell sequences, low expansion cost, low weight, using our custom cost matrix \n Option 2: Optimal matching of transition sequences using our custom cost matrix \n Option 3: Optimal matching of transition sequences, low weight, using our custom cost matrix") +
  scale_x_discrete(name = "dissimilarity option", labels = c("1", "2", "3")) +
  ylab("number of clusters") +
  ylim(1,21) +
  theme_bw()

PAM.R2 %>% mutate(clusters = rowname + 1) %>%
ggplot(aes(x = variable, y = clusters, fill = value)) +
  #facet_grid(. ~ group, scales = "free") +
  geom_tile() +
  scale_fill_continuous(name = "R^2 value", type = "viridis") +
#  labs(title = "30 Years - R2 (optimal 1)", caption = "Average silhouette width (ASW) is the coherence of assignments, high coherence indicates high between-group distances and strong
#within-group homogeneity \n Dissimiarlity options:\n Option 1: Optimal matching of spell sequences, low expansion cost, low weight, using our custom cost matrix \n Option 2: Optimal matching of transition sequences using our custom cost matrix \n Option 3: Optimal matching of transition sequences, low weight, using our custom cost matrix") +
  scale_x_discrete(name = "dissimilarity option", labels = c("1", "2", "3")) +
  ylab("number of clusters") +
  ylim(1,21) +
  theme_bw()

# Taking notes on the high performers for 10, 20, and 30 years
## HIGH PERFORMERS:
## 10 YEARS: OPT 3 19-20, OPT 2 5, OPT 1 19-20
## 20 YEARS: OPT 2 6, OPT 2 19-20
## 30 YEARS: OPT 5, OPT 2 19-20
# Visualizing 10 years 20 cluster solution --------------------------------------------------
seq.10cluster_20$cluster$stats
##          PBC           HG         HGSD          ASW         ASWw           CH 
## 4.366448e-01 8.629729e-01 8.428392e-01 3.901766e-01 3.909038e-01 1.385791e+03 
##           R2         CHsq         R2sq           HC 
## 6.115862e-01 2.968367e+03 7.713105e-01 6.726921e-02
cluster_20 <- seq.10cluster_20$cluster$clustering
cluster_20_fac <- factor(cluster_20)
all <- rownames(seq.10)
all_df <- as.data.frame(cbind(all, cluster_20))
colnames(all_df) <- c("id", "cluster")
covariates <- bg_covariates
covariates$id <- as.character(covariates$id)

all_df <- left_join(all_df, covariates, by = "id") %>% mutate(cluster_group = case_when(
  cluster %in% c(6776, 7088, 7089, 7138, 7143, 7189, 7196) ~ "education",
cluster %in% c(9881, 14946, 14955, 16178) ~ "promotion",
#cluster %in% c() ~ "demotion",
cluster %in% c(14833, 14954, 15114, 15790) ~ "unemployment",
cluster %in% c(10037, 14915, 14957, 16193, 16733) ~ "no change"
),
cluster_label = case_when(
  cluster == 6776 ~ "education to unemployment to education",
  cluster == 7088 ~ "education to zone 5",
  cluster == 7089 ~ "education to zone 4",
  cluster == 7138 ~ "education to zone 3",
  cluster == 7143 ~ "education to unemployment",
  
  cluster == 7189 ~ "zone 4 to education to zone 4",
  cluster == 7196 ~ "education to unemployment to zone 4",
  cluster == 9881 ~ "zone 4 to zone 5",
  cluster == 10037 ~ "zone 5",
  cluster == 14833 ~ "zone 4 to unemployment to zone 4",
  
  cluster == 14915 ~ "zone 4 to zone 3 to zone 4",
  cluster == 14946 ~ "zone 2 to zone 4",
  cluster == 14954 ~ "zone 4 to unemployment",
  cluster == 14955 ~ "zone 3 to zone 4",
  cluster == 14957 ~ "zone 4",
  
  cluster == 15114 ~ "zone 2 to unemployment to zone 2",
  cluster == 15790 ~ "zone 3 to unemployment to zone 3",
  cluster == 16178 ~ "zone 2 to zone 3",
  cluster == 16193 ~ "zone 3",
  cluster == 16733 ~ "zone 2"
), cluster_long = paste0(cluster_group, " - ", cluster_label))
all_df <- all_df %>% left_join(highest_degree_from_forprofit, by = "id")
all_df <- all_df %>% left_join(highest_degree_from_hbcu, by = "id")
all_df <- all_df %>% mutate(irr_bin = ntile(irr, 5))
dataset_top_skills$id <- as.character(dataset_top_skills$id)
all_df <- all_df %>% left_join(dataset_top_skills, by = "id")
dataset_top_majors$id <- as.character(dataset_top_majors$id)
all_df <- all_df %>% left_join(dataset_top_majors, by = "id")
dataset_all_skills$id <- as.character(dataset_all_skills$id)
all_df <- all_df %>% left_join(dataset_all_skills, by = "id")
dataset_all_majors$id <- as.character(dataset_all_majors$id)
all_df <- all_df %>% left_join(dataset_all_majors, by = "id")

all_df <- all_df %>% mutate(
          year_entered_workforce_bin = factor(year_entered_workforce_bin, levels = c("Silent and Greatest", "Baby Boom", "Generation X", "Millennial")),
highest_degree = factor(highest_degree, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")), 
highest_degree_from_forprofit = factor(highest_degree_from_forprofit, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")),
highest_degree_from_hbcu = factor(highest_degree_from_hbcu, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")))

seqdplot(seq.10, group = cluster_20_fac, border = NA, use.layout = TRUE, cols = 5, withlegend = F, sortv = "from.start",
         cpal = c(uva_color_palette[1], uva_color_palette[2],
                  uva_color_palette[4], uva_color_palette[6], uva_color_palette[9],
                  uva_color_palette[5], "#CBBEB5"))

seqrplot(seq.10, group = cluster_20_fac, border = NA, use.layout = TRUE, cols = 5, withlegend = F, sortv = "from.start", diss = seq.10cluster_20$diss,
         cpal = c(uva_color_palette[1], uva_color_palette[2],
                  uva_color_palette[4], uva_color_palette[6], uva_color_palette[9],
                  uva_color_palette[5], "#CBBEB5"))

cluster_membership_visualization(veteran)

cluster_membership_visualization(highest_degree)

cluster_membership_visualization(gender)

cluster_membership_visualization(irr_bin)

cluster_membership_visualization(highest_degree_from_forprofit)

cluster_membership_visualization(highest_degree_from_hbcu)

cluster_membership_visualization(year_entered_workforce_bin)

cluster_membership_visualization_skillmajor(top_skill)

cluster_membership_visualization_skillmajor(top_major)

cluster_membership_visualization_skillmajor(skill)

cluster_membership_visualization_skillmajor(major)

data_profiling(all_df)
## # A tibble: 34 x 5
##    var            variable_type num_unique num_missing perc_missing
##    <chr>          <chr>              <int>       <int>        <dbl>
##  1 id             character          16742           0         0   
##  2 cluster        character             20           0         0   
##  3 veteran        character              2           0         0   
##  4 gender         character              3       27946        13.5 
##  5 zipcode        integer              861           0         0   
##  6 irr            numeric              377           7         0   
##  7 highest_degree factor                 7       47212        22.8 
##  8 forprofit      character              3          22         0.01
##  9 hbcu           character              3          22         0.01
## 10 tribal         character              3          22         0.01
## # … with 24 more rows
  df <- all_df %>%
  filter(!is.na(top_skill)) %>% distinct(id, cluster, cluster_long, cluster_group, cluster_label, top_skill) %>%
  group_by(top_skill) %>% mutate(variable_total = n()) %>% ungroup() %>% mutate(variable_percent = variable_total/n() * 100) %>%
  group_by(cluster) %>%
  mutate(cluster_total = n()) %>%
  group_by(cluster, top_skill) %>%
  mutate(percent = n()/cluster_total * 100, difference = (percent - variable_percent)/variable_percent) %>%
  distinct(top_skill, cluster_long, cluster_group, cluster_label, percent, variable_percent, difference) %>%
  group_by(top_skill) %>% mutate(sum = sum(percent))

df$cluster <- as.factor(df$cluster)
df <- df %>% filter(!is.na(top_skill))
df <- as.data.frame(df)

df %>% group_by(cluster_long) %>% slice_max(difference, n = 5) %>% select(cluster_long, top_skill, difference)
## # A tibble: 100 x 3
## # Groups:   cluster_long [20]
##    cluster_long                                       top_skill     difference
##    <chr>                                              <chr>              <dbl>
##  1 education - education to unemployment              architecture       0.760
##  2 education - education to unemployment              industry           0.743
##  3 education - education to unemployment              economics          0.695
##  4 education - education to unemployment              manufacturing      0.646
##  5 education - education to unemployment              analysis           0.618
##  6 education - education to unemployment to education science            2.54 
##  7 education - education to unemployment to education energy             1.24 
##  8 education - education to unemployment to education legal              0.783
##  9 education - education to unemployment to education environment        0.544
## 10 education - education to unemployment to education media              0.544
## # … with 90 more rows
df %>% group_by(cluster_long) %>% slice_min(difference, n = 5) %>% select(cluster_long, top_skill, difference)
## # A tibble: 100 x 3
## # Groups:   cluster_long [20]
##    cluster_long                                       top_skill    difference
##    <chr>                                              <chr>             <dbl>
##  1 education - education to unemployment              security         -0.489
##  2 education - education to unemployment              human            -0.398
##  3 education - education to unemployment              marketing        -0.266
##  4 education - education to unemployment              information      -0.244
##  5 education - education to unemployment              business         -0.125
##  6 education - education to unemployment to education architecture     -0.839
##  7 education - education to unemployment to education supply           -0.587
##  8 education - education to unemployment to education maintenance      -0.532
##  9 education - education to unemployment to education design           -0.464
## 10 education - education to unemployment to education sales            -0.364
## # … with 90 more rows
  df <- all_df %>%
  filter(!is.na(top_major)) %>% distinct(id, cluster, cluster_long, cluster_group, cluster_label, top_major) %>%
  group_by(top_major) %>% mutate(variable_total = n()) %>% ungroup() %>% mutate(variable_percent = variable_total/n() * 100) %>%
  group_by(cluster) %>%
  mutate(cluster_total = n()) %>%
  group_by(cluster, top_major) %>%
  mutate(percent = n()/cluster_total * 100, difference = (percent - variable_percent)/variable_percent) %>%
  distinct(top_major, cluster_long, cluster_group, cluster_label, percent, variable_percent, difference) %>%
  group_by(top_major) %>% mutate(sum = sum(percent))

df$cluster <- as.factor(df$cluster)
df <- df %>% filter(!is.na(top_major))
df <- as.data.frame(df)

df %>% group_by(cluster_long) %>% slice_max(difference, n = 5) %>% select(cluster_long, top_major, difference)
## # A tibble: 100 x 3
## # Groups:   cluster_long [20]
##    cluster_long                                  top_major            difference
##    <chr>                                         <chr>                     <dbl>
##  1 education - education to unemployment         natural_resource           2.11
##  2 education - education to unemployment         library                    2.11
##  3 education - education to unemployment         culinary_entertainm…       1.80
##  4 education - education to unemployment         family_consumer            1.42
##  5 education - education to unemployment         education                  1.12
##  6 education - education to unemployment to edu… leisure_recreation         3.23
##  7 education - education to unemployment to edu… natural_resource           3.23
##  8 education - education to unemployment to edu… law                        3.17
##  9 education - education to unemployment to edu… military_science           2.81
## 10 education - education to unemployment to edu… biology                    1.81
## # … with 90 more rows
df %>% group_by(cluster_long) %>% slice_min(difference, n = 5) %>% select(cluster_long, top_major, difference)
## # A tibble: 101 x 3
## # Groups:   cluster_long [20]
##    cluster_long                                  top_major            difference
##    <chr>                                         <chr>                     <dbl>
##  1 education - education to unemployment         computer                 -0.340
##  2 education - education to unemployment         communication            -0.308
##  3 education - education to unemployment         homeland_security        -0.253
##  4 education - education to unemployment         architecture             -0.189
##  5 education - education to unemployment         engineering_tech         -0.186
##  6 education - education to unemployment to edu… mathematics              -0.698
##  7 education - education to unemployment to edu… recreation_kinesiol…     -0.634
##  8 education - education to unemployment to edu… education                -0.567
##  9 education - education to unemployment to edu… english                  -0.499
## 10 education - education to unemployment to edu… homeland_security        -0.492
## # … with 91 more rows
# Visualizing 20 years 20 cluster solution --------------------------------------------------
seq.20cluster_20$cluster$stats
##          PBC           HG         HGSD          ASW         ASWw           CH 
##   0.55367685   0.90600478   0.90594103   0.26188038   0.26406106 385.90288875 
##           R2         CHsq         R2sq           HC 
##   0.50632390 952.64294089   0.71686249   0.08681695
cluster_20 <- seq.20cluster_20$cluster$clustering
cluster_20_fac <- factor(cluster_20)
all <- rownames(seq.10)
all_df <- as.data.frame(cbind(all, cluster_20))
colnames(all_df) <- c("id", "cluster")
covariates <- bg_covariates
covariates$id <- as.character(covariates$id)

all_df <- left_join(all_df, covariates, by = "id") %>% mutate(cluster_group = case_when(
  cluster %in% c(60, 95, 299, 1751, 1840, 2627, 2787, 3242) ~ "education",
cluster %in% c(3346, 5976, 6637, 6664) ~ "promotion",
cluster %in% c(4584, 6386) ~ "demotion",
cluster %in% c(5987, 6718) ~ "unemployment",
cluster %in% c(4630, 6650, 7034, 7168) ~ "no change"
),
cluster_label = case_when(
  cluster == 60 ~ "education to zone 5",
  cluster == 95 ~ "education to zone 4",
  cluster == 299 ~ "education to unemployment to zone 4",
  cluster == 1751 ~ "education to unemployment to zone 3",
  cluster == 1840 ~ "education to unemployment to zone 2",
  
  cluster == 2627 ~ "education to zone 4 to education to zone 4",
  cluster == 2787 ~ "education to unemployment",
  cluster == 3242 ~ "education to zone 4",
  cluster == 3346 ~ "zone 4 to zone 5",
  cluster == 4584 ~ "zone 5 to zone 4",
  
  cluster == 4630 ~ "zone 5",
  cluster == 5976 ~ "zone 2 to zone 4",
  cluster == 5987 ~ "zone 4 to unemployment to zone 4",
  cluster == 6386 ~ "zone 4 to zone 3",
  cluster == 6637 ~ "zone 3 to zone 4",
  
  cluster == 6650 ~ "zone 4",
  cluster == 6664 ~ "zone 2 to zone 3",
  cluster == 6718 ~ "zone 3 to unemployment to zone 3",
  cluster == 7034 ~ "zone 3",
  cluster == 7168 ~ "zone 2"
), cluster_long = paste0(cluster_group, " - ", cluster_label))
all_df <- all_df %>% left_join(highest_degree_from_forprofit, by = "id")
all_df <- all_df %>% left_join(highest_degree_from_hbcu, by = "id")
all_df <- all_df %>% mutate(irr_bin = ntile(irr, 5))
dataset_top_skills$id <- as.character(dataset_top_skills$id)
all_df <- all_df %>% left_join(dataset_top_skills, by = "id")
dataset_top_majors$id <- as.character(dataset_top_majors$id)
all_df <- all_df %>% left_join(dataset_top_majors, by = "id")
dataset_all_skills$id <- as.character(dataset_all_skills$id)
all_df <- all_df %>% left_join(dataset_all_skills, by = "id")
dataset_all_majors$id <- as.character(dataset_all_majors$id)
all_df <- all_df %>% left_join(dataset_all_majors, by = "id")

all_df <- all_df %>% mutate(
          year_entered_workforce_bin = factor(year_entered_workforce_bin, levels = c("Silent and Greatest", "Baby Boom", "Generation X", "Millennial")),
highest_degree = factor(highest_degree, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")), 
highest_degree_from_forprofit = factor(highest_degree_from_forprofit, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")),
highest_degree_from_hbcu = factor(highest_degree_from_hbcu, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")))

seqdplot(seq.20, group = cluster_20_fac, border = NA, use.layout = TRUE, cols = 5, withlegend = F, sortv = "from.start",
         cpal = c(uva_color_palette[1], uva_color_palette[2],
                  uva_color_palette[4], uva_color_palette[6], uva_color_palette[9],
                  uva_color_palette[5], "#CBBEB5", "dark gray"))

seqrplot(seq.20, group = cluster_20_fac, border = NA, use.layout = TRUE, cols = 5, withlegend = F, sortv = "from.start", diss = seq.20cluster_20$diss,
         cpal = c(uva_color_palette[1], uva_color_palette[2],
                  uva_color_palette[4], uva_color_palette[6], uva_color_palette[9],
                  uva_color_palette[5], "#CBBEB5"))

cluster_membership_visualization(veteran)

cluster_membership_visualization(highest_degree)

cluster_membership_visualization(gender)

cluster_membership_visualization(irr_bin)

cluster_membership_visualization(highest_degree_from_forprofit)

cluster_membership_visualization(highest_degree_from_hbcu)

cluster_membership_visualization(year_entered_workforce_bin)

cluster_membership_visualization_skillmajor(top_skill)

cluster_membership_visualization_skillmajor(top_major)

cluster_membership_visualization_skillmajor(skill)

cluster_membership_visualization_skillmajor(major)

  df <- all_df %>%
  filter(!is.na(top_skill)) %>% distinct(id, cluster, cluster_long, cluster_group, cluster_label, top_skill) %>%
  group_by(top_skill) %>% mutate(variable_total = n()) %>% ungroup() %>% mutate(variable_percent = variable_total/n() * 100) %>%
  group_by(cluster) %>%
  mutate(cluster_total = n()) %>%
  group_by(cluster, top_skill) %>%
  mutate(percent = n()/cluster_total * 100, difference = (percent - variable_percent)/variable_percent) %>%
  distinct(top_skill, cluster_long, cluster_group, cluster_label, percent, variable_percent, difference) %>%
  group_by(top_skill) %>% mutate(sum = sum(percent))

df$cluster <- as.factor(df$cluster)
df <- df %>% filter(!is.na(top_skill))
df <- as.data.frame(df)

df %>% group_by(cluster_long) %>% slice_max(difference, n = 5) %>% select(cluster_long, top_skill, difference)
## # A tibble: 95 x 3
## # Groups:   cluster_long [19]
##    cluster_long                top_skill    difference
##    <chr>                       <chr>             <dbl>
##  1 demotion - zone 4 to zone 3 engineering       1.09 
##  2 demotion - zone 4 to zone 3 media             0.884
##  3 demotion - zone 4 to zone 3 legal             0.496
##  4 demotion - zone 4 to zone 3 maintenance       0.468
##  5 demotion - zone 4 to zone 3 architecture      0.371
##  6 demotion - zone 5 to zone 4 personal          1.62 
##  7 demotion - zone 5 to zone 4 analysis          0.883
##  8 demotion - zone 5 to zone 4 design            0.830
##  9 demotion - zone 5 to zone 4 agriculture       0.758
## 10 demotion - zone 5 to zone 4 education         0.400
## # … with 85 more rows
df %>% group_by(cluster_long) %>% slice_min(difference, n = 5) %>% select(cluster_long, top_skill, difference)
## # A tibble: 95 x 3
## # Groups:   cluster_long [19]
##    cluster_long                top_skill     difference
##    <chr>                       <chr>              <dbl>
##  1 demotion - zone 4 to zone 3 personal          -0.590
##  2 demotion - zone 4 to zone 3 health            -0.270
##  3 demotion - zone 4 to zone 3 manufacturing     -0.267
##  4 demotion - zone 4 to zone 3 education         -0.204
##  5 demotion - zone 4 to zone 3 design            -0.183
##  6 demotion - zone 5 to zone 4 security          -0.656
##  7 demotion - zone 5 to zone 4 environment       -0.498
##  8 demotion - zone 5 to zone 4 manufacturing     -0.414
##  9 demotion - zone 5 to zone 4 supply            -0.275
## 10 demotion - zone 5 to zone 4 business          -0.261
## # … with 85 more rows
  df <- all_df %>%
  filter(!is.na(top_major)) %>% distinct(id, cluster, cluster_long, cluster_group, cluster_label, top_major) %>%
  group_by(top_major) %>% mutate(variable_total = n()) %>% ungroup() %>% mutate(variable_percent = variable_total/n() * 100) %>%
  group_by(cluster) %>%
  mutate(cluster_total = n()) %>%
  group_by(cluster, top_major) %>%
  mutate(percent = n()/cluster_total * 100, difference = (percent - variable_percent)/variable_percent) %>%
  distinct(top_major, cluster_long, cluster_group, cluster_label, percent, variable_percent, difference) %>%
  group_by(top_major) %>% mutate(sum = sum(percent))

df$cluster <- as.factor(df$cluster)
df <- df %>% filter(!is.na(top_major))
df <- as.data.frame(df)

df %>% group_by(cluster_long) %>% slice_max(difference, n = 5) %>% select(cluster_long, top_major, difference)
## # A tibble: 97 x 3
## # Groups:   cluster_long [19]
##    cluster_long                top_major                difference
##    <chr>                       <chr>                         <dbl>
##  1 demotion - zone 4 to zone 3 construction                 18.2  
##  2 demotion - zone 4 to zone 3 cultural_gender               2.03 
##  3 demotion - zone 4 to zone 3 english                       1.02 
##  4 demotion - zone 4 to zone 3 culinary_entertainment        0.918
##  5 demotion - zone 4 to zone 3 architecture                  0.668
##  6 demotion - zone 5 to zone 4 transportation_materials      2.79 
##  7 demotion - zone 5 to zone 4 education                     2.10 
##  8 demotion - zone 5 to zone 4 family_consumer               1.53 
##  9 demotion - zone 5 to zone 4 mathematics                   1.17 
## 10 demotion - zone 5 to zone 4 law                           1.13 
## # … with 87 more rows
df %>% group_by(cluster_long) %>% slice_min(difference, n = 5) %>% select(cluster_long, top_major, difference)
## # A tibble: 96 x 3
## # Groups:   cluster_long [19]
##    cluster_long                top_major              difference
##    <chr>                       <chr>                       <dbl>
##  1 demotion - zone 4 to zone 3 visual_arts                -0.436
##  2 demotion - zone 4 to zone 3 psychology                 -0.312
##  3 demotion - zone 4 to zone 3 health_professions         -0.274
##  4 demotion - zone 4 to zone 3 engineering_tech           -0.228
##  5 demotion - zone 4 to zone 3 public_administration      -0.195
##  6 demotion - zone 5 to zone 4 homeland_security          -0.545
##  7 demotion - zone 5 to zone 4 social_sciences            -0.462
##  8 demotion - zone 5 to zone 4 engineering                -0.445
##  9 demotion - zone 5 to zone 4 communication              -0.393
## 10 demotion - zone 5 to zone 4 recreation_kinesiology     -0.344
## # … with 86 more rows
# Visualizing 30 years 10 cluster solution --------------------------------------------------
seq.30cluster_10$cluster$stats
##         PBC          HG        HGSD         ASW        ASWw          CH 
##   0.5416033   0.8251470   0.8251211   0.2054577   0.2089100 167.1695255 
##          R2        CHsq        R2sq          HC 
##   0.4011506 402.1089344   0.6170490   0.1333264
cluster_20 <- seq.30cluster_10$cluster$clustering
cluster_20_fac <- factor(cluster_20)
all <- rownames(seq.10)
all_df <- as.data.frame(cbind(all, cluster_20))
colnames(all_df) <- c("id", "cluster")
covariates <- bg_covariates
covariates$id <- as.character(covariates$id)

all_df <- left_join(all_df, covariates, by = "id") %>% mutate(cluster_group = case_when(
  cluster %in% c(380, 429, 506, 880, 1109) ~ "education",
cluster %in% c(2089) ~ "promotion",
#cluster %in% c() ~ "demotion",
#cluster %in% c() ~ "unemployment",
cluster %in% c(584, 2126, 2216, 2255) ~ "no change"
),
cluster_label = case_when(
  cluster == 380 ~ "education to zone 4",
  cluster == 429 ~ "education to unemployment to zone 3",
  cluster == 506 ~ "education to zone 4 to zone 5",
  cluster == 584 ~ "zone 5",
  cluster == 880 ~ "education to unemployment to zone 4",
  
  cluster == 1109 ~ "education to unemployment",
  cluster == 2089 ~ "zone 3 to zone 4",
  cluster == 2126 ~ "zone 4",
  cluster == 2216 ~ "zone 3",
  cluster == 2255 ~ "zone 2",
), cluster_long = paste0(cluster_group, " - ", cluster_label))
all_df <- all_df %>% left_join(highest_degree_from_forprofit, by = "id")
all_df <- all_df %>% left_join(highest_degree_from_hbcu, by = "id")
all_df <- all_df %>% mutate(irr_bin = ntile(irr, 5))
dataset_top_skills$id <- as.character(dataset_top_skills$id)
all_df <- all_df %>% left_join(dataset_top_skills, by = "id")
dataset_top_majors$id <- as.character(dataset_top_majors$id)
all_df <- all_df %>% left_join(dataset_top_majors, by = "id")
dataset_all_skills$id <- as.character(dataset_all_skills$id)
all_df <- all_df %>% left_join(dataset_all_skills, by = "id")
dataset_all_majors$id <- as.character(dataset_all_majors$id)
all_df <- all_df %>% left_join(dataset_all_majors, by = "id")

all_df <- all_df %>% mutate(
          year_entered_workforce_bin = factor(year_entered_workforce_bin, levels = c("Silent and Greatest", "Baby Boom", "Generation X", "Millennial")),
highest_degree = factor(highest_degree, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")), 
highest_degree_from_forprofit = factor(highest_degree_from_forprofit, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")),
highest_degree_from_hbcu = factor(highest_degree_from_hbcu, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")))

seqdplot(seq.30, group = cluster_20_fac, border = NA, use.layout = TRUE, cols = 5, withlegend = F, sortv = "from.start",
         cpal = c(uva_color_palette[1], uva_color_palette[2],
                  uva_color_palette[4], uva_color_palette[6], uva_color_palette[9],
                  uva_color_palette[5], "#CBBEB5"))

seqrplot(seq.30, group = cluster_20_fac, border = NA, use.layout = TRUE, cols = 5, withlegend = F, sortv = "from.start", diss = seq.30cluster_10$diss,
         cpal = c(uva_color_palette[1], uva_color_palette[2],
                  uva_color_palette[4], uva_color_palette[6], uva_color_palette[9],
                  uva_color_palette[5], "#CBBEB5"))

cluster_membership_visualization(veteran)

cluster_membership_visualization(highest_degree)

cluster_membership_visualization(gender)

cluster_membership_visualization(irr_bin)

cluster_membership_visualization(highest_degree_from_forprofit)

cluster_membership_visualization(highest_degree_from_hbcu)

cluster_membership_visualization(year_entered_workforce_bin)

cluster_membership_visualization_skillmajor(top_skill)

cluster_membership_visualization_skillmajor(top_major)

cluster_membership_visualization_skillmajor(skill)

cluster_membership_visualization_skillmajor(major)

  df <- all_df %>%
  filter(!is.na(top_skill)) %>% distinct(id, cluster, cluster_long, cluster_group, cluster_label, top_skill) %>%
  group_by(top_skill) %>% mutate(variable_total = n()) %>% ungroup() %>% mutate(variable_percent = variable_total/n() * 100) %>%
  group_by(cluster) %>%
  mutate(cluster_total = n()) %>%
  group_by(cluster, top_skill) %>%
  mutate(percent = n()/cluster_total * 100, difference = (percent - variable_percent)/variable_percent) %>%
  distinct(top_skill, cluster_long, cluster_group, cluster_label, percent, variable_percent, difference) %>%
  group_by(top_skill) %>% mutate(sum = sum(percent))

df$cluster <- as.factor(df$cluster)
df <- df %>% filter(!is.na(top_skill))
df <- as.data.frame(df)

df %>% group_by(cluster_long) %>% slice_max(difference, n = 5) %>% select(cluster_long, top_skill, difference)
## # A tibble: 50 x 3
## # Groups:   cluster_long [10]
##    cluster_long                                    top_skill   difference
##    <chr>                                           <chr>            <dbl>
##  1 education - education to unemployment           environment      0.462
##  2 education - education to unemployment           personal         0.388
##  3 education - education to unemployment           customer         0.338
##  4 education - education to unemployment           science          0.235
##  5 education - education to unemployment           sales            0.127
##  6 education - education to unemployment to zone 3 agriculture      0.717
##  7 education - education to unemployment to zone 3 environment      0.472
##  8 education - education to unemployment to zone 3 media            0.367
##  9 education - education to unemployment to zone 3 design           0.341
## 10 education - education to unemployment to zone 3 supply           0.201
## # … with 40 more rows
df %>% group_by(cluster_long) %>% slice_min(difference, n = 5) %>% select(cluster_long, top_skill, difference)
## # A tibble: 50 x 3
## # Groups:   cluster_long [10]
##    cluster_long                                    top_skill    difference
##    <chr>                                           <chr>             <dbl>
##  1 education - education to unemployment           energy           -0.423
##  2 education - education to unemployment           industry         -0.302
##  3 education - education to unemployment           architecture     -0.295
##  4 education - education to unemployment           design           -0.273
##  5 education - education to unemployment           legal            -0.252
##  6 education - education to unemployment to zone 3 industry         -0.356
##  7 education - education to unemployment to zone 3 science          -0.240
##  8 education - education to unemployment to zone 3 legal            -0.203
##  9 education - education to unemployment to zone 3 economics        -0.165
## 10 education - education to unemployment to zone 3 business         -0.160
## # … with 40 more rows
  df <- all_df %>%
  filter(!is.na(top_major)) %>% distinct(id, cluster, cluster_long, cluster_group, cluster_label, top_major) %>%
  group_by(top_major) %>% mutate(variable_total = n()) %>% ungroup() %>% mutate(variable_percent = variable_total/n() * 100) %>%
  group_by(cluster) %>%
  mutate(cluster_total = n()) %>%
  group_by(cluster, top_major) %>%
  mutate(percent = n()/cluster_total * 100, difference = (percent - variable_percent)/variable_percent) %>%
  distinct(top_major, cluster_long, cluster_group, cluster_label, percent, variable_percent, difference) %>%
  group_by(top_major) %>% mutate(sum = sum(percent))

df$cluster <- as.factor(df$cluster)
df <- df %>% filter(!is.na(top_major))
df <- as.data.frame(df)

df %>% group_by(cluster_long) %>% slice_max(difference, n = 5) %>% select(cluster_long, top_major, difference)
## # A tibble: 50 x 3
## # Groups:   cluster_long [10]
##    cluster_long                                top_major              difference
##    <chr>                                       <chr>                       <dbl>
##  1 education - education to unemployment       transportation_materi…      1.73 
##  2 education - education to unemployment       architecture                1.14 
##  3 education - education to unemployment       leisure_recreation          0.821
##  4 education - education to unemployment       military_science            0.639
##  5 education - education to unemployment       family_consumer             0.518
##  6 education - education to unemployment to z… library                     4.41 
##  7 education - education to unemployment to z… military_science            2.25 
##  8 education - education to unemployment to z… philosophy_religion         1.16 
##  9 education - education to unemployment to z… natural_resource            0.803
## 10 education - education to unemployment to z… english                     0.708
## # … with 40 more rows
df %>% group_by(cluster_long) %>% slice_min(difference, n = 5) %>% select(cluster_long, top_major, difference)
## # A tibble: 50 x 3
## # Groups:   cluster_long [10]
##    cluster_long                                  top_major            difference
##    <chr>                                         <chr>                     <dbl>
##  1 education - education to unemployment         culinary_entertainm…     -0.590
##  2 education - education to unemployment         agriculture_veterin…     -0.415
##  3 education - education to unemployment         cultural_gender          -0.353
##  4 education - education to unemployment         visual_arts              -0.277
##  5 education - education to unemployment         communication            -0.246
##  6 education - education to unemployment to zon… education                -0.631
##  7 education - education to unemployment to zon… mathematics              -0.614
##  8 education - education to unemployment to zon… family_consumer          -0.399
##  9 education - education to unemployment to zon… humanities               -0.340
## 10 education - education to unemployment to zon… architecture             -0.294
## # … with 40 more rows
# 15 YEARS EXPERIENCE ------------------------------------------------------------------------
# Creating sequence objects fo 10, 20, and 30 years
seq.10 <- create_seq_all_fixed("seq.10", 10, left_unemp = FALSE, 15)
seq.20 <- create_seq_all_fixed("seq.20", 20, left_unemp = FALSE, 15)
seq.30 <- create_seq_all_fixed("seq.30", 30, left_unemp = FALSE, 15)

# Clustering sequnces
seq.10cluster_20 <- cluster("cluster_10", seq.10, sequence_option = "option 1", 20)
seq.20cluster_20 <- cluster("cluster_10", seq.20, sequence_option = "option 2", 20)
seq.30cluster_10 <- cluster("cluster_10", seq.30, sequence_option = "option 2", 10)
# Visualizing 10 years 20 cluster solution --------------------------------------------------
seq.10cluster_20$cluster$stats
##          PBC           HG         HGSD          ASW         ASWw           CH 
## 4.652542e-01 8.711621e-01 8.533859e-01 4.448714e-01 4.464166e-01 7.324497e+02 
##           R2         CHsq         R2sq           HC 
## 6.343133e-01 1.364724e+03 7.637011e-01 4.976412e-02
cluster_20 <- seq.10cluster_20$cluster$clustering
cluster_20_fac <- factor(cluster_20)
all <- rownames(seq.10)
all_df <- as.data.frame(cbind(all, cluster_20))
colnames(all_df) <- c("id", "cluster")
covariates <- bg_covariates
covariates$id <- as.character(covariates$id)

all_df <- left_join(all_df, covariates, by = "id") %>% mutate(cluster_group = case_when(
  cluster %in% c(483, 1470, 2107, 2136, 2154) ~ "education",
cluster %in% c(4183, 7140, 7149, 7746) ~ "promotion",
cluster %in% c(4172, 7133) ~ "demotion",
cluster %in% c(4166, 6940, 7114, 7725) ~ "unemployment",
cluster %in% c(4139, 7116, 7151, 7761, 8042) ~ "no change"
),
cluster_label = case_when(
  cluster == 483 ~ "education to zone 5",
  cluster == 1470 ~ "zone 4 to education to zone 4",
  cluster == 2107 ~ "zone 3 to education to zone 3",
  cluster == 2136 ~ "zone 4 to education",
  cluster == 2154 ~ "zone 4 to education to zone 4",
  
  cluster == 4139 ~ "zone 4 to zone 5 to zone 4",
  cluster == 4166 ~ "zone 5 to unemployment",
  cluster == 4172 ~ "zone 5 to zone 4",
  cluster == 4183 ~ "zone 4 to zone 5",
  cluster == 6940 ~ "zone 4 to unemployment to zone 4",
  
  cluster == 7114 ~ "zone 4 to unemployment",
  cluster == 7116 ~ "zone 4 to zone 3 to zone 4",
  cluster == 7133 ~ "zone 4 to zone 3",
  cluster == 7140 ~ "zone 2 to zone 4",
  cluster == 7149 ~ "zone 3 to zone 4",
  
  cluster == 7151 ~ "zone 4",
  cluster == 7725 ~ "zone 3 to unemployment",
  cluster == 7746 ~ "zone 2 to zone 3",
  cluster == 7761 ~ "zone 3",
  cluster == 8042 ~ "zone 2"
), cluster_long = paste0(cluster_group, " - ", cluster_label))
all_df <- all_df %>% left_join(highest_degree_from_forprofit, by = "id")
all_df <- all_df %>% left_join(highest_degree_from_hbcu, by = "id")
all_df <- all_df %>% mutate(irr_bin = ntile(irr, 5))
dataset_top_skills$id <- as.character(dataset_top_skills$id)
all_df <- all_df %>% left_join(dataset_top_skills, by = "id")
dataset_top_majors$id <- as.character(dataset_top_majors$id)
all_df <- all_df %>% left_join(dataset_top_majors, by = "id")
dataset_all_skills$id <- as.character(dataset_all_skills$id)
all_df <- all_df %>% left_join(dataset_all_skills, by = "id")
dataset_all_majors$id <- as.character(dataset_all_majors$id)
all_df <- all_df %>% left_join(dataset_all_majors, by = "id")

all_df <- all_df %>% mutate(
          year_entered_workforce_bin = factor(year_entered_workforce_bin, levels = c("Silent and Greatest", "Baby Boom", "Generation X", "Millennial")),
highest_degree = factor(highest_degree, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")), 
highest_degree_from_forprofit = factor(highest_degree_from_forprofit, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")),
highest_degree_from_hbcu = factor(highest_degree_from_hbcu, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")))

seqdplot(seq.10, group = cluster_20_fac, border = NA, use.layout = TRUE, cols = 5, withlegend = F, sortv = "from.start",
         cpal = c(uva_color_palette[1], uva_color_palette[2],
                  uva_color_palette[4], uva_color_palette[6], uva_color_palette[9],
                  uva_color_palette[5], "#CBBEB5", "dark gray"))

seqrplot(seq.10, group = cluster_20_fac, border = NA, use.layout = TRUE, cols = 5, withlegend = F, sortv = "from.start", diss = seq.10cluster_20$diss,
         cpal = c(uva_color_palette[1], uva_color_palette[2],
                  uva_color_palette[4], uva_color_palette[6], uva_color_palette[9],
                  uva_color_palette[5], "#CBBEB5"))

cluster_membership_visualization(veteran)

cluster_membership_visualization(highest_degree)

cluster_membership_visualization(gender)

cluster_membership_visualization(irr_bin)

cluster_membership_visualization(highest_degree_from_forprofit)

cluster_membership_visualization(highest_degree_from_hbcu)

cluster_membership_visualization(year_entered_workforce_bin)

cluster_membership_visualization_skillmajor(top_skill)

cluster_membership_visualization_skillmajor(top_major)

cluster_membership_visualization_skillmajor(skill)

cluster_membership_visualization_skillmajor(major)

  df <- all_df %>%
  filter(!is.na(top_skill)) %>% distinct(id, cluster, cluster_long, cluster_group, cluster_label, top_skill) %>%
  group_by(top_skill) %>% mutate(variable_total = n()) %>% ungroup() %>% mutate(variable_percent = variable_total/n() * 100) %>%
  group_by(cluster) %>%
  mutate(cluster_total = n()) %>%
  group_by(cluster, top_skill) %>%
  mutate(percent = n()/cluster_total * 100, difference = (percent - variable_percent)/variable_percent) %>%
  distinct(top_skill, cluster_long, cluster_group, cluster_label, percent, variable_percent, difference) %>%
  group_by(top_skill) %>% mutate(sum = sum(percent))

df$cluster <- as.factor(df$cluster)
df <- df %>% filter(!is.na(top_skill))
df <- as.data.frame(df)

df %>% group_by(cluster_long) %>% slice_max(difference, n = 5) %>% select(cluster_long, top_skill, difference)
## # A tibble: 95 x 3
## # Groups:   cluster_long [19]
##    cluster_long                top_skill    difference
##    <chr>                       <chr>             <dbl>
##  1 demotion - zone 4 to zone 3 analysis          0.945
##  2 demotion - zone 4 to zone 3 design            0.815
##  3 demotion - zone 4 to zone 3 engineering       0.760
##  4 demotion - zone 4 to zone 3 maintenance       0.535
##  5 demotion - zone 4 to zone 3 customer          0.496
##  6 demotion - zone 5 to zone 4 economics         2.53 
##  7 demotion - zone 5 to zone 4 engineering       1.12 
##  8 demotion - zone 5 to zone 4 marketing         0.476
##  9 demotion - zone 5 to zone 4 architecture      0.396
## 10 demotion - zone 5 to zone 4 business          0.305
## # … with 85 more rows
df %>% group_by(cluster_long) %>% slice_min(difference, n = 5) %>% select(cluster_long, top_skill, difference)
## # A tibble: 95 x 3
## # Groups:   cluster_long [19]
##    cluster_long                top_skill     difference
##    <chr>                       <chr>              <dbl>
##  1 demotion - zone 4 to zone 3 marketing         -0.700
##  2 demotion - zone 4 to zone 3 business          -0.435
##  3 demotion - zone 4 to zone 3 manufacturing     -0.428
##  4 demotion - zone 4 to zone 3 media             -0.381
##  5 demotion - zone 4 to zone 3 health            -0.231
##  6 demotion - zone 5 to zone 4 maintenance       -0.580
##  7 demotion - zone 5 to zone 4 environment       -0.468
##  8 demotion - zone 5 to zone 4 manufacturing     -0.437
##  9 demotion - zone 5 to zone 4 finance           -0.282
## 10 demotion - zone 5 to zone 4 customer          -0.264
## # … with 85 more rows
  df <- all_df %>%
  filter(!is.na(top_major)) %>% distinct(id, cluster, cluster_long, cluster_group, cluster_label, top_major) %>%
  group_by(top_major) %>% mutate(variable_total = n()) %>% ungroup() %>% mutate(variable_percent = variable_total/n() * 100) %>%
  group_by(cluster) %>%
  mutate(cluster_total = n()) %>%
  group_by(cluster, top_major) %>%
  mutate(percent = n()/cluster_total * 100, difference = (percent - variable_percent)/variable_percent) %>%
  distinct(top_major, cluster_long, cluster_group, cluster_label, percent, variable_percent, difference) %>%
  group_by(top_major) %>% mutate(sum = sum(percent))

df$cluster <- as.factor(df$cluster)
df <- df %>% filter(!is.na(top_major))
df <- as.data.frame(df)

df %>% group_by(cluster_long) %>% slice_max(difference, n = 5) %>% select(cluster_long, top_major, difference)
## # A tibble: 96 x 3
## # Groups:   cluster_long [19]
##    cluster_long                top_major              difference
##    <chr>                       <chr>                       <dbl>
##  1 demotion - zone 4 to zone 3 culinary_entertainment       7.32
##  2 demotion - zone 4 to zone 3 philosophy_religion          1.45
##  3 demotion - zone 4 to zone 3 interdisciplinary            1.31
##  4 demotion - zone 4 to zone 3 visual_arts                  1.31
##  5 demotion - zone 4 to zone 3 law                          1.31
##  6 demotion - zone 5 to zone 4 architecture                 3.83
##  7 demotion - zone 5 to zone 4 agriculture_veterinary       1.61
##  8 demotion - zone 5 to zone 4 interdisciplinary            1.32
##  9 demotion - zone 5 to zone 4 cultural_gender              1.32
## 10 demotion - zone 5 to zone 4 law                          1.32
## # … with 86 more rows
df %>% group_by(cluster_long) %>% slice_min(difference, n = 5) %>% select(cluster_long, top_major, difference)
## # A tibble: 95 x 3
## # Groups:   cluster_long [19]
##    cluster_long                top_major             difference
##    <chr>                       <chr>                      <dbl>
##  1 demotion - zone 4 to zone 3 communication             -0.685
##  2 demotion - zone 4 to zone 3 engineering               -0.593
##  3 demotion - zone 4 to zone 3 psychology                -0.524
##  4 demotion - zone 4 to zone 3 public_administration     -0.492
##  5 demotion - zone 4 to zone 3 language_linguistics      -0.184
##  6 demotion - zone 5 to zone 4 physics                   -0.782
##  7 demotion - zone 5 to zone 4 health_professions        -0.598
##  8 demotion - zone 5 to zone 4 humanities                -0.346
##  9 demotion - zone 5 to zone 4 business                  -0.303
## 10 demotion - zone 5 to zone 4 computer                  -0.289
## # … with 85 more rows
# Visualizing 20 years 20 cluster solution --------------------------------------------------
seq.20cluster_20$cluster$stats
##          PBC           HG         HGSD          ASW         ASWw           CH 
##   0.59830170   0.90939171   0.90930998   0.28257954   0.28884498 135.12292411 
##           R2         CHsq         R2sq           HC 
##   0.50445398 316.19414619   0.70432684   0.06991502
cluster_20 <- seq.20cluster_20$cluster$clustering
cluster_20_fac <- factor(cluster_20)
all <- rownames(seq.10)
all_df <- as.data.frame(cbind(all, cluster_20))
colnames(all_df) <- c("id", "cluster")
covariates <- bg_covariates
covariates$id <- as.character(covariates$id)

all_df <- left_join(all_df, covariates, by = "id") %>% mutate(cluster_group = case_when(
  cluster %in% c(123, 303, 530, 639) ~ "education",
cluster %in% c(1004, 1196, 1598, 2244, 2381) ~ "promotion",
cluster %in% c(965, 1541, 2172, 2487, 2542) ~ "demotion",
cluster %in% c(958, 1822, 2310) ~ "unemployment",
cluster %in% c(1452, 1461, 2371) ~ "no change"
),
cluster_label = case_when(
  cluster == 123 ~ "education to unemployment",
  cluster == 303 ~ "zone 4 to education to zone 4",
  cluster == 530 ~ "education to zone 5",
  cluster == 639 ~ "education to zone 4",
  cluster == 958 ~ "zone 5 to unemployment to zone 5",
  
  cluster == 965 ~ "zone 5 to zone 4",
  cluster == 1004 ~ "zone 4 to zone 5 (Late career)",
  cluster == 1196 ~ "zone 4 to zone 5 (Early career)",
  cluster == 1452 ~ "zone 4 to zone 5 to zone 4",
  cluster == 1461 ~ "zone 5",
  
  cluster == 1541 ~ "zone 4 to zone 3",
  cluster == 1598 ~ "zone 3 to zone 4",
  cluster == 1822 ~ "zone 4 to unemployment",
  cluster == 2172 ~ "zone 4 to zone 2",
  cluster == 2244 ~ "zone 2 to zone 4",
  
  cluster == 2310 ~ "zone 4 to unemployment to zone 4",
  cluster == 2371 ~ "zone 4",
  cluster == 2381 ~ "zone 2 to zone 3",
  cluster == 2487 ~ "zone 4 to zone 3",
  cluster == 2542 ~ "zone 4 to zone 2"
), cluster_long = paste0(cluster_group, " - ", cluster_label))
all_df <- all_df %>% left_join(highest_degree_from_forprofit, by = "id")
all_df <- all_df %>% left_join(highest_degree_from_hbcu, by = "id")
all_df <- all_df %>% mutate(irr_bin = ntile(irr, 5))
dataset_top_skills$id <- as.character(dataset_top_skills$id)
all_df <- all_df %>% left_join(dataset_top_skills, by = "id")
dataset_top_majors$id <- as.character(dataset_top_majors$id)
all_df <- all_df %>% left_join(dataset_top_majors, by = "id")
dataset_all_skills$id <- as.character(dataset_all_skills$id)
all_df <- all_df %>% left_join(dataset_all_skills, by = "id")
dataset_all_majors$id <- as.character(dataset_all_majors$id)
all_df <- all_df %>% left_join(dataset_all_majors, by = "id")

all_df <- all_df %>% mutate(
          year_entered_workforce_bin = factor(year_entered_workforce_bin, levels = c("Silent and Greatest", "Baby Boom", "Generation X", "Millennial")),
highest_degree = factor(highest_degree, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")), 
highest_degree_from_forprofit = factor(highest_degree_from_forprofit, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")),
highest_degree_from_hbcu = factor(highest_degree_from_hbcu, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")))

seqdplot(seq.20, group = cluster_20_fac, border = NA, use.layout = TRUE, cols = 5, withlegend = F, sortv = "from.start",
         cpal = c(uva_color_palette[1], uva_color_palette[2],
                  uva_color_palette[4], uva_color_palette[6], uva_color_palette[9],
                  uva_color_palette[5], "#CBBEB5", "dark gray"))

seqrplot(seq.20, group = cluster_20_fac, border = NA, use.layout = TRUE, cols = 5, withlegend = F, sortv = "from.start", diss = seq.20cluster_20$diss,
         cpal = c(uva_color_palette[1], uva_color_palette[2],
                  uva_color_palette[4], uva_color_palette[6], uva_color_palette[9],
                  uva_color_palette[5], "#CBBEB5"))

cluster_membership_visualization(veteran)

cluster_membership_visualization(highest_degree)

cluster_membership_visualization(gender)

cluster_membership_visualization(irr_bin)

cluster_membership_visualization(highest_degree_from_forprofit)

cluster_membership_visualization(highest_degree_from_hbcu)

cluster_membership_visualization(year_entered_workforce_bin)

cluster_membership_visualization_skillmajor(top_skill)

cluster_membership_visualization_skillmajor(top_major)

cluster_membership_visualization_skillmajor(skill)

cluster_membership_visualization_skillmajor(major)

  df <- all_df %>%
  filter(!is.na(top_skill)) %>% distinct(id, cluster, cluster_long, cluster_group, cluster_label, top_skill) %>%
  group_by(top_skill) %>% mutate(variable_total = n()) %>% ungroup() %>% mutate(variable_percent = variable_total/n() * 100) %>%
  group_by(cluster) %>%
  mutate(cluster_total = n()) %>%
  group_by(cluster, top_skill) %>%
  mutate(percent = n()/cluster_total * 100, difference = (percent - variable_percent)/variable_percent) %>%
  distinct(top_skill, cluster_long, cluster_group, cluster_label, percent, variable_percent, difference) %>%
  group_by(top_skill) %>% mutate(sum = sum(percent))

df$cluster <- as.factor(df$cluster)
df <- df %>% filter(!is.na(top_skill))
df <- as.data.frame(df)

df %>% group_by(cluster_long) %>% slice_max(difference, n = 5) %>% select(cluster_long, top_skill, difference)
## # A tibble: 90 x 3
## # Groups:   cluster_long [18]
##    cluster_long                top_skill     difference
##    <chr>                       <chr>              <dbl>
##  1 demotion - zone 4 to zone 2 analysis           4.36 
##  2 demotion - zone 4 to zone 2 agriculture        1.84 
##  3 demotion - zone 4 to zone 2 economics          1.47 
##  4 demotion - zone 4 to zone 2 environment        1.23 
##  5 demotion - zone 4 to zone 2 manufacturing      1.00 
##  6 demotion - zone 4 to zone 3 science            1.35 
##  7 demotion - zone 4 to zone 3 environment        0.946
##  8 demotion - zone 4 to zone 3 personal           0.747
##  9 demotion - zone 4 to zone 3 industry           0.733
## 10 demotion - zone 4 to zone 3 manufacturing      0.488
## # … with 80 more rows
df %>% group_by(cluster_long) %>% slice_min(difference, n = 5) %>% select(cluster_long, top_skill, difference)
## # A tibble: 91 x 3
## # Groups:   cluster_long [18]
##    cluster_long                top_skill     difference
##    <chr>                       <chr>              <dbl>
##  1 demotion - zone 4 to zone 2 human             -0.797
##  2 demotion - zone 4 to zone 2 engineering       -0.596
##  3 demotion - zone 4 to zone 2 education         -0.526
##  4 demotion - zone 4 to zone 2 personal          -0.491
##  5 demotion - zone 4 to zone 2 analysis          -0.433
##  6 demotion - zone 4 to zone 3 maintenance       -0.779
##  7 demotion - zone 4 to zone 3 analysis          -0.611
##  8 demotion - zone 4 to zone 3 manufacturing     -0.505
##  9 demotion - zone 4 to zone 3 media             -0.464
## 10 demotion - zone 4 to zone 3 security          -0.425
## # … with 81 more rows
  df <- all_df %>%
  filter(!is.na(top_major)) %>% distinct(id, cluster, cluster_long, cluster_group, cluster_label, top_major) %>%
  group_by(top_major) %>% mutate(variable_total = n()) %>% ungroup() %>% mutate(variable_percent = variable_total/n() * 100) %>%
  group_by(cluster) %>%
  mutate(cluster_total = n()) %>%
  group_by(cluster, top_major) %>%
  mutate(percent = n()/cluster_total * 100, difference = (percent - variable_percent)/variable_percent) %>%
  distinct(top_major, cluster_long, cluster_group, cluster_label, percent, variable_percent, difference) %>%
  group_by(top_major) %>% mutate(sum = sum(percent))

df$cluster <- as.factor(df$cluster)
df <- df %>% filter(!is.na(top_major))
df <- as.data.frame(df)

df %>% group_by(cluster_long) %>% slice_max(difference, n = 5) %>% select(cluster_long, top_major, difference)
## # A tibble: 93 x 3
## # Groups:   cluster_long [18]
##    cluster_long                top_major              difference
##    <chr>                       <chr>                       <dbl>
##  1 demotion - zone 4 to zone 2 recreation_kinesiology       4.35
##  2 demotion - zone 4 to zone 2 culinary_entertainment       3.64
##  3 demotion - zone 4 to zone 2 cultural_gender              1.58
##  4 demotion - zone 4 to zone 2 engineering_tech             1.56
##  5 demotion - zone 4 to zone 2 language_linguistics         1.53
##  6 demotion - zone 4 to zone 3 library                     13.1 
##  7 demotion - zone 4 to zone 3 leisure_recreation           8.51
##  8 demotion - zone 4 to zone 3 culinary_entertainment       4.65
##  9 demotion - zone 4 to zone 3 military_science             2.57
## 10 demotion - zone 4 to zone 3 architecture                 2.29
## # … with 83 more rows
df %>% group_by(cluster_long) %>% slice_min(difference, n = 5) %>% select(cluster_long, top_major, difference)
## # A tibble: 90 x 3
## # Groups:   cluster_long [18]
##    cluster_long                top_major             difference
##    <chr>                       <chr>                      <dbl>
##  1 demotion - zone 4 to zone 2 physics                   -0.758
##  2 demotion - zone 4 to zone 2 public_administration     -0.717
##  3 demotion - zone 4 to zone 2 engineering_tech          -0.448
##  4 demotion - zone 4 to zone 2 homeland_security         -0.434
##  5 demotion - zone 4 to zone 2 computer                  -0.369
##  6 demotion - zone 4 to zone 3 psychology                -0.677
##  7 demotion - zone 4 to zone 3 biology                   -0.660
##  8 demotion - zone 4 to zone 3 health_professions        -0.457
##  9 demotion - zone 4 to zone 3 theology_religion         -0.380
## 10 demotion - zone 4 to zone 3 public_administration     -0.311
## # … with 80 more rows
# APPROXIMATE AGE ---------------------------------------------------------------
# Cohort clustering whole career ------------------------------------------------
seq.10 <- create_seq_all_cohort("seq.10", 2000, 2005, after_military = FALSE, left_unemp = FALSE)
seq.10cluster_20 <- cluster("cluster_10", seq.10, sequence_option = "option 1", 20)

cluster_20 <- seq.10cluster_20$cluster$clustering
cluster_20_fac <- factor(cluster_20)
all <- rownames(seq.10)
all_df <- as.data.frame(cbind(all, cluster_20))
colnames(all_df) <- c("id", "cluster")
covariates <- bg_covariates
covariates$id <- as.character(covariates$id)

all_df <- left_join(all_df, covariates, by = "id") %>% mutate(cluster_group = case_when(
  cluster %in% c(2791, 6868, 6884, 6885, 6888) ~ "education",
cluster %in% c(8748, 14400, 14430, 16736) ~ "promotion",
cluster %in% c(8741, 14409, 14414) ~ "demotion",
cluster %in% c(14053) ~ "unemployment",
cluster %in% c(2830, 2831, 6883, 8743, 14435, 16744, 18571) ~ "no change"
),
cluster_label = case_when(
  cluster == 2791 ~ "education to military",
  cluster == 2809 ~ "military",
  cluster == 2830 ~ "military to zone 4",
  cluster == 2831 ~ "education to zone 4",
  cluster == 6862 ~ "zone 4 to education to military/zone 2",
  cluster == 6885 ~ "education to zone 3",
  cluster == 6887 ~ "education to zone 2",
  cluster == 6895 ~ "education to zone 5",
  cluster == 6897 ~ "zone 5 to zone 4",
  cluster == 8763 ~ "zone 5",
  cluster == 8764 ~ "zone 4 to zone 5",
  cluster == 8765 ~ "unemployment to zone 4",
  cluster == 14495 ~ "zone 2 to zone 4",
  cluster == 14510 ~ "zone 4 to zone 3",
  cluster == 14513 ~ "zone 4 to zone 2",
  cluster == 14519 ~ "zone 3 to zone 4",
  cluster == 14520 ~ "zone 4",
  cluster == 16825 ~ "zone 2 to zone 3",
  cluster == 16834 ~ "zone 3",
  cluster == 18637 ~ "zone 2"
), cluster_long = paste0(cluster_group, " - ", cluster_label))
all_df <- all_df %>% left_join(highest_degree_from_forprofit, by = "id")
all_df <- all_df %>% left_join(highest_degree_from_hbcu, by = "id")
all_df <- all_df %>% mutate(irr_bin = ntile(irr, 5))
dataset_top_skills$id <- as.character(dataset_top_skills$id)
all_df <- all_df %>% left_join(dataset_top_skills, by = "id")
dataset_top_majors$id <- as.character(dataset_top_majors$id)
all_df <- all_df %>% left_join(dataset_top_majors, by = "id")
dataset_all_skills$id <- as.character(dataset_all_skills$id)
all_df <- all_df %>% left_join(dataset_all_skills, by = "id")
dataset_all_majors$id <- as.character(dataset_all_majors$id)
all_df <- all_df %>% left_join(dataset_all_majors, by = "id")


all_df <- all_df %>% mutate(
          year_entered_workforce_bin = factor(year_entered_workforce_bin, levels = c("Silent and Greatest", "Baby Boom", "Generation X", "Millennial")),
highest_degree = factor(highest_degree, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")), 
highest_degree_from_forprofit = factor(highest_degree_from_forprofit, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")),
highest_degree_from_hbcu = factor(highest_degree_from_hbcu, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")))

seq.10 <- seq.10[,1:20]
seqdplot(seq.10, group = cluster_20_fac, border = NA, use.layout = TRUE, cols = 5, withlegend = F, sortv = "from.start", axes = c(1,20), 
         cpal = c(uva_color_palette[1], uva_color_palette[2],
                  uva_color_palette[4], uva_color_palette[6], uva_color_palette[9],
                  uva_color_palette[5], "red", "#CBBEB5"))
seqrplot(seq.10, group = cluster_20_fac, border = NA, use.layout = TRUE, cols = 5, withlegend = F, sortv = "from.start", diss = seq.10cluster_20$diss,
         cpal = c(uva_color_palette[1], uva_color_palette[2],
                  uva_color_palette[4], uva_color_palette[6], uva_color_palette[9],
                  uva_color_palette[5], "red", "#CBBEB5"))

cluster_membership_visualization(veteran)
cluster_membership_visualization(highest_degree)
cluster_membership_visualization(gender)
cluster_membership_visualization(irr_bin)
cluster_membership_visualization(highest_degree_from_forprofit)
cluster_membership_visualization(highest_degree_from_hbcu)
cluster_membership_visualization(year_entered_workforce_bin)
cluster_membership_visualization_skillmajor(top_skill)
cluster_membership_visualization_skillmajor(top_major)
cluster_membership_visualization_skillmajor(skill)
cluster_membership_visualization_skillmajor(major)

  df <- all_df %>%
  filter(!is.na(top_skill)) %>% distinct(id, cluster, cluster_long, cluster_group, cluster_label, top_skill) %>%
  group_by(top_skill) %>% mutate(variable_total = n()) %>% ungroup() %>% mutate(variable_percent = variable_total/n() * 100) %>%
  group_by(cluster) %>%
  mutate(cluster_total = n()) %>%
  group_by(cluster, top_skill) %>%
  mutate(percent = n()/cluster_total * 100, difference = (percent - variable_percent)/variable_percent) %>%
  distinct(top_skill, cluster_long, cluster_group, cluster_label, percent, variable_percent, difference) %>%
  group_by(top_skill) %>% mutate(sum = sum(percent))

df$cluster <- as.factor(df$cluster)
df <- df %>% filter(!is.na(top_skill))
df <- as.data.frame(df)

df %>% group_by(cluster_long) %>% slice_max(difference, n = 5) %>% select(cluster_long, top_skill, difference)
df %>% group_by(cluster_long) %>% slice_min(difference, n = 5) %>% select(cluster_long, top_skill, difference)

  df <- all_df %>%
  filter(!is.na(top_major)) %>% distinct(id, cluster, cluster_long, cluster_group, cluster_label, top_major) %>%
  group_by(top_major) %>% mutate(variable_total = n()) %>% ungroup() %>% mutate(variable_percent = variable_total/n() * 100) %>%
  group_by(cluster) %>%
  mutate(cluster_total = n()) %>%
  group_by(cluster, top_major) %>%
  mutate(percent = n()/cluster_total * 100, difference = (percent - variable_percent)/variable_percent) %>%
  distinct(top_major, cluster_long, cluster_group, cluster_label, percent, variable_percent, difference) %>%
  group_by(top_major) %>% mutate(sum = sum(percent))

df$cluster <- as.factor(df$cluster)
df <- df %>% filter(!is.na(top_major))
df <- as.data.frame(df)

df %>% group_by(cluster_long) %>% slice_max(difference, n = 5) %>% select(cluster_long, top_major, difference)
df %>% group_by(cluster_long) %>% slice_min(difference, n = 5) %>% select(cluster_long, top_major, difference)
# Filtering for five years of experience and careers of 10 years ----------------------------------
seq.10 <- create_seq_all_fixed_experience("seq.10", 10, left_unemp = FALSE, 5)
seq.10cluster_20 <- cluster("cluster_10", seq.10, sequence_option = "option 1", 20)

cluster_20 <- seq.10cluster_20$cluster$clustering
cluster_20_fac <- factor(cluster_20)
all <- rownames(seq.10)
all_df <- as.data.frame(cbind(all, cluster_20))
colnames(all_df) <- c("id", "cluster")
covariates <- bg_covariates
covariates$id <- as.character(covariates$id)

all_df <- left_join(all_df, covariates, by = "id") %>% mutate(cluster_group = case_when(
  cluster %in% c(3140, 3148, 3191, 3212) ~ "education",
cluster %in% c(5016, 8681, 8684, 9682) ~ "promotion",
cluster %in% c(5053, 8641) ~ "demotion",
cluster %in% c(8523, 8670, 9664, 10161) ~ "unemployment",
cluster %in% c(5074, 5078, 8310, 8675, 9682, 10156) ~ "no change"
),
cluster_label = case_when(
  cluster == 3140 ~ "education to zone 4",
  cluster == 3148 ~ "education to unemployment",
  cluster == 3191 ~ "zone 4 to education to zone 4",
  cluster == 3212 ~ "education to zone 5",
  cluster == 5016 ~ "zone 4 to zone 5",
  cluster == 5053 ~ "zone 5 to zone 4",
  cluster == 5074 ~ "zone 5",
  cluster == 5078 ~ "zone 4 to zone 5 to zone 4",
  cluster == 8310 ~ "zone 4",
  cluster == 8523 ~ "zone 4 to unemployment to zone 4",
  cluster == 8641 ~ "zone 4 to zone 3",
  cluster == 8670 ~ "zone 4 to unemployment",
  cluster == 8675 ~ "zone 4 to zone 3 to zone 4",
  cluster == 8681 ~ "zone 2 to zone 4",
  cluster == 8684 ~ "zone 3 to zone 4",
  cluster == 9664 ~ "zone 3 to unemployment to zone 3",
  cluster == 9682 ~ "zone 2 to zone 3",
  cluster == 9698 ~ "zone 3",
  cluster == 10156 ~ "zone 2",
  cluster == 10161 ~ "zone 2 to unemployment to zone 2",
), cluster_long = paste0(cluster_group, " - ", cluster_label))
all_df <- all_df %>% left_join(highest_degree_from_forprofit, by = "id")
all_df <- all_df %>% left_join(highest_degree_from_hbcu, by = "id")
all_df <- all_df %>% mutate(irr_bin = ntile(irr, 5))
dataset_top_skills$id <- as.character(dataset_top_skills$id)
all_df <- all_df %>% left_join(dataset_top_skills, by = "id")
dataset_top_majors$id <- as.character(dataset_top_majors$id)
all_df <- all_df %>% left_join(dataset_top_majors, by = "id")
dataset_all_skills$id <- as.character(dataset_all_skills$id)
all_df <- all_df %>% left_join(dataset_all_skills, by = "id")
dataset_all_majors$id <- as.character(dataset_all_majors$id)
all_df <- all_df %>% left_join(dataset_all_majors, by = "id")

all_df <- all_df %>% mutate(
          year_entered_workforce_bin = factor(year_entered_workforce_bin, levels = c("Silent and Greatest", "Baby Boom", "Generation X", "Millennial")),
highest_degree = factor(highest_degree, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")), 
highest_degree_from_forprofit = factor(highest_degree_from_forprofit, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")),
highest_degree_from_hbcu = factor(highest_degree_from_hbcu, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")))

seqdplot(seq.10, group = cluster_20_fac, border = NA, use.layout = TRUE, cols = 5, withlegend = F, sortv = "from.start", axes = c(1,20), 
         cpal = c(uva_color_palette[1], uva_color_palette[2],
                  uva_color_palette[4], uva_color_palette[6], uva_color_palette[9],
                  uva_color_palette[5], "#CBBEB5"))
seqrplot(seq.10, group = cluster_20_fac, border = NA, use.layout = TRUE, cols = 5, withlegend = F, sortv = "from.start", diss = seq.10cluster_20$diss,
         cpal = c(uva_color_palette[1], uva_color_palette[2],
                  uva_color_palette[4], uva_color_palette[6], uva_color_palette[9],
                  uva_color_palette[5], "#CBBEB5"))

cluster_membership_visualization(veteran)
cluster_membership_visualization(highest_degree)
cluster_membership_visualization(gender)
cluster_membership_visualization(irr_bin)
cluster_membership_visualization(highest_degree_from_forprofit)
cluster_membership_visualization(highest_degree_from_hbcu)
cluster_membership_visualization(year_entered_workforce_bin)
cluster_membership_visualization_skillmajor(top_skill)
cluster_membership_visualization_skillmajor(top_major)
cluster_membership_visualization_skillmajor(skill)
cluster_membership_visualization_skillmajor(major)

  df <- all_df %>%
  filter(!is.na(top_skill)) %>% distinct(id, cluster, cluster_long, cluster_group, cluster_label, top_skill) %>%
  group_by(top_skill) %>% mutate(variable_total = n()) %>% ungroup() %>% mutate(variable_percent = variable_total/n() * 100) %>%
  group_by(cluster) %>%
  mutate(cluster_total = n()) %>%
  group_by(cluster, top_skill) %>%
  mutate(percent = n()/cluster_total * 100, difference = (percent - variable_percent)/variable_percent) %>%
  distinct(top_skill, cluster_long, cluster_group, cluster_label, percent, variable_percent, difference) %>%
  group_by(top_skill) %>% mutate(sum = sum(percent))

df$cluster <- as.factor(df$cluster)
df <- df %>% filter(!is.na(top_skill))
df <- as.data.frame(df)

df %>% group_by(cluster_long) %>% slice_max(difference, n = 5) %>% select(cluster_long, top_skill, difference)
df %>% group_by(cluster_long) %>% slice_min(difference, n = 5) %>% select(cluster_long, top_skill, difference)

  df <- all_df %>%
  filter(!is.na(top_major)) %>% distinct(id, cluster, cluster_long, cluster_group, cluster_label, top_major) %>%
  group_by(top_major) %>% mutate(variable_total = n()) %>% ungroup() %>% mutate(variable_percent = variable_total/n() * 100) %>%
  group_by(cluster) %>%
  mutate(cluster_total = n()) %>%
  group_by(cluster, top_major) %>%
  mutate(percent = n()/cluster_total * 100, difference = (percent - variable_percent)/variable_percent) %>%
  distinct(top_major, cluster_long, cluster_group, cluster_label, percent, variable_percent, difference) %>%
  group_by(top_major) %>% mutate(sum = sum(percent))

df$cluster <- as.factor(df$cluster)
df <- df %>% filter(!is.na(top_major))
df <- as.data.frame(df)

df %>% group_by(cluster_long) %>% slice_max(difference, n = 5) %>% select(cluster_long, top_major, difference)
df %>% group_by(cluster_long) %>% slice_min(difference, n = 5) %>% select(cluster_long, top_major, difference)
# VETERANS -----------------------------------------------------------------------------
# Creating sequence objects for 10, 20, and 30 years after exiting the military for veterans 
seq.10 <- create_seq_vet_fixed("seq.10", 10, left_unemp = FALSE)
seq.20 <- create_seq_vet_fixed("seq.20", 20, left_unemp = FALSE)
seq.30 <- create_seq_vet_fixed("seq.30", 30, left_unemp = FALSE)

# Clustering sequnces
seq.10cluster_20 <- cluster("cluster_10", seq.10, sequence_option = "option 1", 20)
seq.20cluster_20 <- cluster("cluster_10", seq.20, sequence_option = "option 2", 20)
seq.30cluster_10 <- cluster("cluster_10", seq.30, sequence_option = "option 2", 10)
# Visualizing 10 years 20 cluster solution ----------------------
seq.10cluster_20$cluster$stats
##          PBC           HG         HGSD          ASW         ASWw           CH 
##   0.42080980   0.82953931   0.80984445   0.37621229   0.37882230 389.84777555 
##           R2         CHsq         R2sq           HC 
##   0.60628979 727.08985966   0.74174091   0.06945614
cluster_20 <- seq.10cluster_20$cluster$clustering
cluster_20_fac <- factor(cluster_20)
all <- rownames(seq.10)
all_df <- as.data.frame(cbind(all, cluster_20))
colnames(all_df) <- c("id", "cluster")
covariates <- bg_covariates
covariates$id <- as.character(covariates$id)

all_df <- left_join(all_df, covariates, by = "id") %>% mutate(cluster_group = case_when(
  cluster %in% c(781, 1360, 1435, 1511) ~ "education",
cluster %in% c(2607, 4345, 4416, 4690) ~ "promotion",
cluster %in% c(2769, 4409) ~ "demotion",
cluster %in% c(2727, 4303, 4390, 4684) ~ "unemployment",
cluster %in% c(2763, 2772, 4169, 4427, 4705, 4830) ~ "no change"
),
cluster_label = case_when(
  cluster == 781 ~ "zone 5 to education to zone 5",
  cluster == 1360 ~ "zone 4 to education to zone 4",
  cluster == 1435 ~ "education to zone 4",
  cluster == 1511 ~ "zone 4 to education",
  cluster == 2607 ~ "zone 4 to zone 5",
  cluster == 2727 ~ "zone 4 to unemployment to zone 5",
  cluster == 2763 ~ "zone 5",
  cluster == 2769 ~ "zone 5 to zone 4",
  cluster == 2772 ~ "zone 4 to zone 5 zone 4",
  cluster == 4169 ~ "zone 4 to zone 3 to zone 4",
  cluster == 4303 ~ "zone 4 to unemployment to zone 4",
  cluster == 4345 ~ "zone 3 to zone 4",
  cluster == 4390 ~ "zone 4 to unemployment",
  cluster == 4409 ~ "zone 4 to zone 3",
  cluster == 4416 ~ "zone 2 to zone 4",
  cluster == 4427 ~ "zone 4",
  cluster == 4684 ~ "zone 3 to unemployment",
  cluster == 4690 ~ "zone 2 to zone 3",
  cluster == 4705 ~ "zone 3",
  cluster == 4830 ~ "zone 2"
), cluster_long = paste0(cluster_group, " - ", cluster_label))
all_df <- all_df %>% left_join(highest_degree_from_forprofit, by = "id")
all_df <- all_df %>% left_join(highest_degree_from_hbcu, by = "id")
all_df <- all_df %>% mutate(irr_bin = ntile(irr, 5))
dataset_top_skills$id <- as.character(dataset_top_skills$id)
all_df <- all_df %>% left_join(dataset_top_skills, by = "id")
dataset_top_majors$id <- as.character(dataset_top_majors$id)
all_df <- all_df %>% left_join(dataset_top_majors, by = "id")
dataset_all_skills$id <- as.character(dataset_all_skills$id)
all_df <- all_df %>% left_join(dataset_all_skills, by = "id")
dataset_all_majors$id <- as.character(dataset_all_majors$id)
all_df <- all_df %>% left_join(dataset_all_majors, by = "id")

# Plotting
seqdplot(seq.10, group = cluster_20_fac, border = NA, use.layout = TRUE, cols = 5, withlegend = F, sortv = "from.start",
         cpal = c(uva_color_palette[1], uva_color_palette[2],
                  uva_color_palette[4], uva_color_palette[6], uva_color_palette[9],
                  uva_color_palette[5], "#CBBEB5"))

seqrplot(seq.10, group = cluster_20_fac, border = NA, use.layout = TRUE, cols = 5, with.legend = F, sortv = "from.start", diss = seq.10cluster_20$diss,
         cpal = c(uva_color_palette[1], uva_color_palette[2],
                  uva_color_palette[4], uva_color_palette[6], uva_color_palette[9],
                  uva_color_palette[5], "#CBBEB5"))

all_df <- all_df %>% mutate(end_onet55_label = case_when(
  end_onet55 == "55-3011.00" ~ "Air Crew Members",
  end_onet55 == "55-1011.00" ~ "Air Crew Officers",
  end_onet55 == "55-1012.00" ~ "Aircraft Launch and Recovery Officers",
  end_onet55 == "55-3012.00" ~ "Aircraft Launch and Recovery Specialists",
  end_onet55 == "55-3013.00" ~ "Armored Assault Vehicle Crew Members",
  end_onet55 == "55-1013.00" ~ "Armored Assault Vehicle Officers",
  end_onet55 == "55-3014.00" ~ "Artillery and Missile Crew Members",
  end_onet55 == "55-1014.00" ~ "Artillery and Missile Officers",
  end_onet55 == "55-1015.00" ~ "Command and Control Center Officers",
  end_onet55 == "55-3015.00" ~ "Command and Control Center Specialists",
  end_onet55 == "55-2011.00" ~ "First-Line Supervisors of Air Crew Members",
  end_onet55 == "55-2013.00" ~ "First-Line Supervisors of All Other Tactical Operations Specialists",
  end_onet55 == "55-2012.00" ~ "First-Line Supervisors of Weapons Specialists/Crew Members",
  end_onet55 == "55-3016.00" ~ "Infantry",
  end_onet55 == "55-1016.00" ~ "Infantry Officers",
  end_onet55 == "55-3019.00" ~ "Military Enlisted Tactical Operations and Air/Weapons Specialists and Crew Members, All Other",
  end_onet55 == "55-1019.00" ~ "Military Officer Special and Tactical Operations Leaders, All Other",
  end_onet55 == "55-3018.00" ~ "Special Forces",
  end_onet55 == "55-1017.00" ~ "Special Forces Officers",
), years_in_military_bin = factor(years_in_military_bin, levels = c("1 - 2", "3 - 4", "5 - 10", "11 - 20", "21 - 30", "31+")), 
          year_military_enlist_bin = factor(year_military_enlist_bin, levels = c("Vietnam era (1964 to 1974)", "1975 to 1989", "1990 to 2000 (including Persian Gulf War)", "2001 or later")),
          career_before_military_bin = factor(career_before_military_bin, levels = c("0", "1 - 4", "5 - 10", "11 - 20", "21 - 30", "31+")),
          year_entered_workforce_bin = factor(year_entered_workforce_bin, levels = c("Silent and Greatest", "Baby Boom", "Generation X", "Millennial")),
highest_degree = factor(highest_degree, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")), 
highest_degree_from_forprofit = factor(highest_degree_from_forprofit, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")),
highest_degree_from_hbcu = factor(highest_degree_from_hbcu, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")))

cluster_membership_visualization(highest_degree)

cluster_membership_visualization(gender)

cluster_membership_visualization(irr_bin)

cluster_membership_visualization(highest_degree_from_forprofit)

cluster_membership_visualization(highest_degree_from_hbcu)

cluster_membership_visualization(year_entered_workforce_bin)

cluster_membership_visualization(officer)

cluster_membership_visualization(end_onet55_label)

cluster_membership_visualization(years_in_military_bin)

cluster_membership_visualization(year_military_enlist_bin)

cluster_membership_visualization(career_before_military_bin)

cluster_membership_visualization_skillmajor(end_onet55_label)

cluster_membership_visualization_skillmajor(top_skill)

cluster_membership_visualization_skillmajor(top_major)

cluster_membership_visualization_skillmajor(skill)

cluster_membership_visualization_skillmajor(major)

  df <- all_df %>%
  filter(!is.na(top_skill)) %>% distinct(id, cluster, cluster_long, cluster_group, cluster_label, top_skill) %>%
  group_by(top_skill) %>% mutate(variable_total = n()) %>% ungroup() %>% mutate(variable_percent = variable_total/n() * 100) %>%
  group_by(cluster) %>%
  mutate(cluster_total = n()) %>%
  group_by(cluster, top_skill) %>%
  mutate(percent = n()/cluster_total * 100, difference = (percent - variable_percent)/variable_percent) %>%
  distinct(top_skill, cluster_long, cluster_group, cluster_label, percent, variable_percent, difference) %>%
  group_by(top_skill) %>% mutate(sum = sum(percent))

df$cluster <- as.factor(df$cluster)
df <- df %>% filter(!is.na(top_skill))
df <- as.data.frame(df)

df %>% group_by(cluster_long) %>% slice_max(difference, n = 5) %>% select(cluster_long, top_skill, difference)
## # A tibble: 100 x 3
## # Groups:   cluster_long [20]
##    cluster_long                top_skill   difference
##    <chr>                       <chr>            <dbl>
##  1 demotion - zone 4 to zone 3 design           2.22 
##  2 demotion - zone 4 to zone 3 analysis         1.49 
##  3 demotion - zone 4 to zone 3 environment      1.19 
##  4 demotion - zone 4 to zone 3 legal            1.03 
##  5 demotion - zone 4 to zone 3 sales            0.727
##  6 demotion - zone 5 to zone 4 economics        2.97 
##  7 demotion - zone 5 to zone 4 media            1.92 
##  8 demotion - zone 5 to zone 4 engineering      1.48 
##  9 demotion - zone 5 to zone 4 analysis         0.806
## 10 demotion - zone 5 to zone 4 business         0.359
## # … with 90 more rows
df %>% group_by(cluster_long) %>% slice_min(difference, n = 5) %>% select(cluster_long, top_skill, difference)
## # A tibble: 100 x 3
## # Groups:   cluster_long [20]
##    cluster_long                top_skill     difference
##    <chr>                       <chr>              <dbl>
##  1 demotion - zone 4 to zone 3 health            -0.569
##  2 demotion - zone 4 to zone 3 security          -0.424
##  3 demotion - zone 4 to zone 3 business          -0.368
##  4 demotion - zone 4 to zone 3 education         -0.353
##  5 demotion - zone 4 to zone 3 engineering       -0.240
##  6 demotion - zone 5 to zone 4 maintenance       -0.884
##  7 demotion - zone 5 to zone 4 manufacturing     -0.712
##  8 demotion - zone 5 to zone 4 architecture      -0.577
##  9 demotion - zone 5 to zone 4 human             -0.538
## 10 demotion - zone 5 to zone 4 finance           -0.304
## # … with 90 more rows
  df <- all_df %>%
  filter(!is.na(top_major)) %>% distinct(id, cluster, cluster_long, cluster_group, cluster_label, top_major) %>%
  group_by(top_major) %>% mutate(variable_total = n()) %>% ungroup() %>% mutate(variable_percent = variable_total/n() * 100) %>%
  group_by(cluster) %>%
  mutate(cluster_total = n()) %>%
  group_by(cluster, top_major) %>%
  mutate(percent = n()/cluster_total * 100, difference = (percent - variable_percent)/variable_percent) %>%
  distinct(top_major, cluster_long, cluster_group, cluster_label, percent, variable_percent, difference) %>%
  group_by(top_major) %>% mutate(sum = sum(percent))

df$cluster <- as.factor(df$cluster)
df <- df %>% filter(!is.na(top_major))
df <- as.data.frame(df)

df %>% group_by(cluster_long) %>% slice_max(difference, n = 5) %>% select(cluster_long, top_major, difference)
## # A tibble: 101 x 3
## # Groups:   cluster_long [20]
##    cluster_long                top_major              difference
##    <chr>                       <chr>                       <dbl>
##  1 demotion - zone 4 to zone 3 law                         6.50 
##  2 demotion - zone 4 to zone 3 engineering_tech            2.05 
##  3 demotion - zone 4 to zone 3 homeland_security           1.36 
##  4 demotion - zone 4 to zone 3 computer                    1.05 
##  5 demotion - zone 4 to zone 3 humanities                  0.963
##  6 demotion - zone 5 to zone 4 agriculture_veterinary      2.61 
##  7 demotion - zone 5 to zone 4 education                   2.61 
##  8 demotion - zone 5 to zone 4 law                         2.28 
##  9 demotion - zone 5 to zone 4 architecture                1.26 
## 10 demotion - zone 5 to zone 4 public_administration       0.935
## # … with 91 more rows
df %>% group_by(cluster_long) %>% slice_min(difference, n = 5) %>% select(cluster_long, top_major, difference)
## # A tibble: 100 x 3
## # Groups:   cluster_long [20]
##    cluster_long                top_major            difference
##    <chr>                       <chr>                     <dbl>
##  1 demotion - zone 4 to zone 3 engineering              -0.518
##  2 demotion - zone 4 to zone 3 business                 -0.265
##  3 demotion - zone 4 to zone 3 social_sciences           0.461
##  4 demotion - zone 4 to zone 3 psychology                0.666
##  5 demotion - zone 4 to zone 3 humanities                0.963
##  6 demotion - zone 5 to zone 4 humanities               -0.570
##  7 demotion - zone 5 to zone 4 language_linguistics     -0.484
##  8 demotion - zone 5 to zone 4 physics                  -0.366
##  9 demotion - zone 5 to zone 4 engineering_tech         -0.331
## 10 demotion - zone 5 to zone 4 business                 -0.287
## # … with 90 more rows
# Visualizing 20 years 20 cluster solution --------------------------------------------------
seq.20cluster_20$cluster$stats
##          PBC           HG         HGSD          ASW         ASWw           CH 
##   0.53963236   0.87397820   0.87388810   0.21171959   0.22112803  84.55773514 
##           R2         CHsq         R2sq           HC 
##   0.48587626 192.33936296   0.68250695   0.09853612
cluster_20 <- seq.20cluster_20$cluster$clustering
cluster_20_fac <- factor(cluster_20)
all <- rownames(seq.10)
all_df <- as.data.frame(cbind(all, cluster_20))
colnames(all_df) <- c("id", "cluster")
covariates <- bg_covariates
covariates$id <- as.character(covariates$id)

all_df <- left_join(all_df, covariates, by = "id") %>% mutate(cluster_group = case_when(
  cluster %in% c(91, 163, 343, 452) ~ "education",
cluster %in% c(628, 1020, 1071, 1348, 1643) ~ "promotion",
cluster %in% c(576, 1363, 1428, 1690, 1720) ~ "demotion",
cluster %in% c(668, 1078, 1566) ~ "unemployment",
cluster %in% c(1018, 1027, 1627) ~ "no change"
),
cluster_label = case_when(
  cluster == 91 ~ "education to unemployment",
  cluster == 163 ~ "zone 4 to education to zone 4",
  cluster == 343 ~ "education to zone 5",
  cluster == 452 ~ "education to zone 4",
  cluster == 576 ~ "zone 5 to zone 4",
  cluster == 628 ~ "zone 4 to zone 5 (early career)",
  cluster == 668 ~ "zone 4/5 to unemployment to zone 5",
  cluster == 1018 ~ "zone 4 to zone 5 to zone 4",
  cluster == 1020 ~ "zone 4 to zone 5 (late career)",
  cluster == 1027 ~ "zone 5",
  cluster == 1071 ~ "zone 3 to zone 4 (early career)",
  cluster == 1078 ~ "zone 4 to unemployment",
  cluster == 1348 ~ "zone 3 to zone 4 (mid career)",
  cluster == 1363 ~ "zone 4 to zone 3 (mid career)",
  cluster == 1428 ~ "zone 4 to zone 2 (mid career)",
  cluster == 1566 ~ "zone 4 to unemployment to zone 4",
  cluster == 1627 ~ "zone 4",
  cluster == 1643 ~ "zone 2 to zone 3",
  cluster == 1690 ~ "zone 4 to zone 3 (early career)",
  cluster == 1720 ~ "zone 4 to zone 2 (early career)"
), cluster_long = paste0(cluster_group, " - ", cluster_label))
all_df <- all_df %>% left_join(highest_degree_from_forprofit, by = "id")
all_df <- all_df %>% left_join(highest_degree_from_hbcu, by = "id")
all_df <- all_df %>% mutate(irr_bin = ntile(irr, 5))
dataset_top_skills$id <- as.character(dataset_top_skills$id)
all_df <- all_df %>% left_join(dataset_top_skills, by = "id")
dataset_top_majors$id <- as.character(dataset_top_majors$id)
all_df <- all_df %>% left_join(dataset_top_majors, by = "id")
dataset_all_skills$id <- as.character(dataset_all_skills$id)
all_df <- all_df %>% left_join(dataset_all_skills, by = "id")
dataset_all_majors$id <- as.character(dataset_all_majors$id)
all_df <- all_df %>% left_join(dataset_all_majors, by = "id")

# Plotting
seqdplot(seq.20, group = cluster_20_fac, border = NA, use.layout = TRUE, cols = 5, withlegend = F, sortv = "from.start",
         cpal = c(uva_color_palette[1], uva_color_palette[2],
                  uva_color_palette[4], uva_color_palette[6], uva_color_palette[9],
                  uva_color_palette[5], "#CBBEB5"))

seqrplot(seq.20, group = cluster_20_fac, border = NA, use.layout = TRUE, cols = 5, with.legend = F, sortv = "from.start", diss = seq.20cluster_20$diss,
         cpal = c(uva_color_palette[1], uva_color_palette[2],
                  uva_color_palette[4], uva_color_palette[6], uva_color_palette[9],
                  uva_color_palette[5], "#CBBEB5"))

all_df <- all_df %>% mutate(end_onet55_label = case_when(
  end_onet55 == "55-3011.00" ~ "Air Crew Members",
  end_onet55 == "55-1011.00" ~ "Air Crew Officers",
  end_onet55 == "55-1012.00" ~ "Aircraft Launch and Recovery Officers",
  end_onet55 == "55-3012.00" ~ "Aircraft Launch and Recovery Specialists",
  end_onet55 == "55-3013.00" ~ "Armored Assault Vehicle Crew Members",
  end_onet55 == "55-1013.00" ~ "Armored Assault Vehicle Officers",
  end_onet55 == "55-3014.00" ~ "Artillery and Missile Crew Members",
  end_onet55 == "55-1014.00" ~ "Artillery and Missile Officers",
  end_onet55 == "55-1015.00" ~ "Command and Control Center Officers",
  end_onet55 == "55-3015.00" ~ "Command and Control Center Specialists",
  end_onet55 == "55-2011.00" ~ "First-Line Supervisors of Air Crew Members",
  end_onet55 == "55-2013.00" ~ "First-Line Supervisors of All Other Tactical Operations Specialists",
  end_onet55 == "55-2012.00" ~ "First-Line Supervisors of Weapons Specialists/Crew Members",
  end_onet55 == "55-3016.00" ~ "Infantry",
  end_onet55 == "55-1016.00" ~ "Infantry Officers",
  end_onet55 == "55-3019.00" ~ "Military Enlisted Tactical Operations and Air/Weapons Specialists and Crew Members, All Other",
  end_onet55 == "55-1019.00" ~ "Military Officer Special and Tactical Operations Leaders, All Other",
  end_onet55 == "55-3018.00" ~ "Special Forces",
  end_onet55 == "55-1017.00" ~ "Special Forces Officers",
), years_in_military_bin = factor(years_in_military_bin, levels = c("1 - 2", "3 - 4", "5 - 10", "11 - 20", "21 - 30", "31+")), 
          year_military_enlist_bin = factor(year_military_enlist_bin, levels = c("Vietnam era (1964 to 1974)", "1975 to 1989", "1990 to 2000 (including Persian Gulf War)", "2001 or later")),
          career_before_military_bin = factor(career_before_military_bin, levels = c("0", "1 - 4", "5 - 10", "11 - 20", "21 - 30", "31+")),
          year_entered_workforce_bin = factor(year_entered_workforce_bin, levels = c("Silent and Greatest", "Baby Boom", "Generation X", "Millennial")),
highest_degree = factor(highest_degree, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")), 
highest_degree_from_forprofit = factor(highest_degree_from_forprofit, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")),
highest_degree_from_hbcu = factor(highest_degree_from_hbcu, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")))

cluster_membership_visualization(highest_degree)

cluster_membership_visualization(gender)

cluster_membership_visualization(irr_bin)

cluster_membership_visualization(highest_degree_from_forprofit)

cluster_membership_visualization(highest_degree_from_hbcu)

cluster_membership_visualization(year_entered_workforce_bin)

cluster_membership_visualization(officer)

cluster_membership_visualization(end_onet55_label)

cluster_membership_visualization(years_in_military_bin)

cluster_membership_visualization(year_military_enlist_bin)

cluster_membership_visualization(career_before_military_bin)

cluster_membership_visualization_skillmajor(end_onet55_label)

cluster_membership_visualization_skillmajor(top_skill)

cluster_membership_visualization_skillmajor(top_major)

cluster_membership_visualization_skillmajor(skill)

cluster_membership_visualization_skillmajor(major)

  df <- all_df %>%
  filter(!is.na(top_skill)) %>% distinct(id, cluster, cluster_long, cluster_group, cluster_label, top_skill) %>%
  group_by(top_skill) %>% mutate(variable_total = n()) %>% ungroup() %>% mutate(variable_percent = variable_total/n() * 100) %>%
  group_by(cluster) %>%
  mutate(cluster_total = n()) %>%
  group_by(cluster, top_skill) %>%
  mutate(percent = n()/cluster_total * 100, difference = (percent - variable_percent)/variable_percent) %>%
  distinct(top_skill, cluster_long, cluster_group, cluster_label, percent, variable_percent, difference) %>%
  group_by(top_skill) %>% mutate(sum = sum(percent))

df$cluster <- as.factor(df$cluster)
df <- df %>% filter(!is.na(top_skill))
df <- as.data.frame(df)

df %>% group_by(cluster_long) %>% slice_max(difference, n = 5) %>% select(cluster_long, top_skill, difference)
## # A tibble: 100 x 3
## # Groups:   cluster_long [20]
##    cluster_long                               top_skill   difference
##    <chr>                                      <chr>            <dbl>
##  1 demotion - zone 4 to zone 2 (early career) personal         4.43 
##  2 demotion - zone 4 to zone 2 (early career) science          0.810
##  3 demotion - zone 4 to zone 2 (early career) legal            0.584
##  4 demotion - zone 4 to zone 2 (early career) security         0.286
##  5 demotion - zone 4 to zone 2 (early career) finance          0.167
##  6 demotion - zone 4 to zone 2 (mid career)   analysis         3.08 
##  7 demotion - zone 4 to zone 2 (mid career)   energy           1.80 
##  8 demotion - zone 4 to zone 2 (mid career)   customer         1.09 
##  9 demotion - zone 4 to zone 2 (mid career)   engineering      0.869
## 10 demotion - zone 4 to zone 2 (mid career)   environment      0.794
## # … with 90 more rows
df %>% group_by(cluster_long) %>% slice_min(difference, n = 5) %>% select(cluster_long, top_skill, difference)
## # A tibble: 100 x 3
## # Groups:   cluster_long [20]
##    cluster_long                               top_skill     difference
##    <chr>                                      <chr>              <dbl>
##  1 demotion - zone 4 to zone 2 (early career) manufacturing     -0.646
##  2 demotion - zone 4 to zone 2 (early career) architecture      -0.480
##  3 demotion - zone 4 to zone 2 (early career) customer          -0.432
##  4 demotion - zone 4 to zone 2 (early career) supply            -0.391
##  5 demotion - zone 4 to zone 2 (early career) maintenance       -0.286
##  6 demotion - zone 4 to zone 2 (mid career)   human             -0.652
##  7 demotion - zone 4 to zone 2 (mid career)   marketing         -0.596
##  8 demotion - zone 4 to zone 2 (mid career)   health            -0.294
##  9 demotion - zone 4 to zone 2 (mid career)   sales             -0.292
## 10 demotion - zone 4 to zone 2 (mid career)   legal             -0.169
## # … with 90 more rows
  df <- all_df %>%
  filter(!is.na(top_major)) %>% distinct(id, cluster, cluster_long, cluster_group, cluster_label, top_major) %>%
  group_by(top_major) %>% mutate(variable_total = n()) %>% ungroup() %>% mutate(variable_percent = variable_total/n() * 100) %>%
  group_by(cluster) %>%
  mutate(cluster_total = n()) %>%
  group_by(cluster, top_major) %>%
  mutate(percent = n()/cluster_total * 100, difference = (percent - variable_percent)/variable_percent) %>%
  distinct(top_major, cluster_long, cluster_group, cluster_label, percent, variable_percent, difference) %>%
  group_by(top_major) %>% mutate(sum = sum(percent))

df$cluster <- as.factor(df$cluster)
df <- df %>% filter(!is.na(top_major))
df <- as.data.frame(df)

df %>% group_by(cluster_long) %>% slice_max(difference, n = 5) %>% select(cluster_long, top_major, difference)
## # A tibble: 101 x 3
## # Groups:   cluster_long [20]
##    cluster_long                               top_major              difference
##    <chr>                                      <chr>                       <dbl>
##  1 demotion - zone 4 to zone 2 (early career) culinary_entertainment       4.73
##  2 demotion - zone 4 to zone 2 (early career) architecture                 4.73
##  3 demotion - zone 4 to zone 2 (early career) theology_religion            4.29
##  4 demotion - zone 4 to zone 2 (early career) education                    3.58
##  5 demotion - zone 4 to zone 2 (early career) family_consumer              3.58
##  6 demotion - zone 4 to zone 2 (mid career)   recreation_kinesiology       2.99
##  7 demotion - zone 4 to zone 2 (mid career)   law                          2.63
##  8 demotion - zone 4 to zone 2 (mid career)   homeland_security            2.42
##  9 demotion - zone 4 to zone 2 (mid career)   language_linguistics         2.42
## 10 demotion - zone 4 to zone 2 (mid career)   public_administration        1.85
## # … with 91 more rows
df %>% group_by(cluster_long) %>% slice_min(difference, n = 5) %>% select(cluster_long, top_major, difference)
## # A tibble: 101 x 3
## # Groups:   cluster_long [20]
##    cluster_long                               top_major             difference
##    <chr>                                      <chr>                      <dbl>
##  1 demotion - zone 4 to zone 2 (early career) public_administration     -0.591
##  2 demotion - zone 4 to zone 2 (early career) communication             -0.316
##  3 demotion - zone 4 to zone 2 (early career) business                  -0.241
##  4 demotion - zone 4 to zone 2 (early career) engineering_tech          -0.152
##  5 demotion - zone 4 to zone 2 (early career) computer                  -0.103
##  6 demotion - zone 4 to zone 2 (mid career)   engineering               -0.533
##  7 demotion - zone 4 to zone 2 (mid career)   computer                  -0.432
##  8 demotion - zone 4 to zone 2 (mid career)   communication             -0.404
##  9 demotion - zone 4 to zone 2 (mid career)   physics                   -0.300
## 10 demotion - zone 4 to zone 2 (mid career)   engineering_tech          -0.261
## # … with 91 more rows
# Visualizing 30 years 10 cluster solution -------------------------------------------------
seq.30cluster_10$cluster$stats
##        PBC         HG       HGSD        ASW       ASWw         CH         R2 
##  0.5528748  0.8025530  0.8025144  0.2035540  0.2280479 21.4122729  0.3788215 
##       CHsq       R2sq         HC 
## 47.5151041  0.5750609  0.1226446
cluster_20 <- seq.30cluster_10$cluster$clustering
cluster_20_fac <- factor(cluster_20)
all <- rownames(seq.30)
all_df <- as.data.frame(cbind(all, cluster_20))
colnames(all_df) <- c("id", "cluster")
covariates <- bg_covariates
covariates$id <- as.character(covariates$id)

all_df <- left_join(all_df, covariates, by = "id") %>% mutate(cluster_group = case_when(
  cluster %in% c(3, 40, 122) ~ "education",
cluster %in% c(135, 325) ~ "promotion",
cluster %in% c() ~ "demotion",
cluster %in% c(302, 303) ~ "unemployment",
cluster %in% c(186, 238, 314) ~ "no change",
),
cluster_label = case_when(
  cluster == 3 ~ "zone 4 to education to zone 4 and unemmployment",
  cluster == 40 ~ "zone 4 to education to zone 4",
  cluster == 122 ~ "zone 4 to long unemployment to education to zone 5",
  cluster == 135 ~ "zone 4 to zone 5 to zone 4",
  cluster == 186 ~ "zone 5",
  cluster == 238 ~ "zone 3",
  cluster == 302 ~ "zone 4 to zone 3 to unemployment",
  cluster == 303 ~ "zone 4 to unemployment to zone 4",
  cluster == 314 ~ "zone 4",
  cluster == 325 ~ "zone 2 to zone 5 to zone 4"
), cluster_long = paste0(cluster_group, " - ", cluster_label))
all_df <- all_df %>% left_join(highest_degree_from_forprofit, by = "id")
all_df <- all_df %>% left_join(highest_degree_from_hbcu, by = "id")
all_df <- all_df %>% mutate(irr_bin = ntile(irr, 5))
dataset_top_skills$id <- as.character(dataset_top_skills$id)
all_df <- all_df %>% left_join(dataset_top_skills, by = "id")
dataset_top_majors$id <- as.character(dataset_top_majors$id)
all_df <- all_df %>% left_join(dataset_top_majors, by = "id")
dataset_all_skills$id <- as.character(dataset_all_skills$id)
all_df <- all_df %>% left_join(dataset_all_skills, by = "id")
dataset_all_majors$id <- as.character(dataset_all_majors$id)
all_df <- all_df %>% left_join(dataset_all_majors, by = "id")

# Plotting
seqdplot(seq.30, group = cluster_20_fac, border = NA, use.layout = TRUE, cols = 5, withlegend = F, sortv = "from.start",
         cpal = c(uva_color_palette[1], uva_color_palette[2],
                  uva_color_palette[4], uva_color_palette[6], uva_color_palette[9],
                  uva_color_palette[5], "#CBBEB5", "dark gray"))

seqrplot(seq.30, group = cluster_20_fac, border = NA, use.layout = TRUE, cols = 5, with.legend = F, sortv = "from.start", diss = seq.30cluster_10$diss,
         cpal = c(uva_color_palette[1], uva_color_palette[2],
                  uva_color_palette[4], uva_color_palette[6], uva_color_palette[9],
                  uva_color_palette[5], "#CBBEB5"))

all_df <- all_df %>% mutate(end_onet55_label = case_when(
  end_onet55 == "55-3011.00" ~ "Air Crew Members",
  end_onet55 == "55-1011.00" ~ "Air Crew Officers",
  end_onet55 == "55-1012.00" ~ "Aircraft Launch and Recovery Officers",
  end_onet55 == "55-3012.00" ~ "Aircraft Launch and Recovery Specialists",
  end_onet55 == "55-3013.00" ~ "Armored Assault Vehicle Crew Members",
  end_onet55 == "55-1013.00" ~ "Armored Assault Vehicle Officers",
  end_onet55 == "55-3014.00" ~ "Artillery and Missile Crew Members",
  end_onet55 == "55-1014.00" ~ "Artillery and Missile Officers",
  end_onet55 == "55-1015.00" ~ "Command and Control Center Officers",
  end_onet55 == "55-3015.00" ~ "Command and Control Center Specialists",
  end_onet55 == "55-2011.00" ~ "First-Line Supervisors of Air Crew Members",
  end_onet55 == "55-2013.00" ~ "First-Line Supervisors of All Other Tactical Operations Specialists",
  end_onet55 == "55-2012.00" ~ "First-Line Supervisors of Weapons Specialists/Crew Members",
  end_onet55 == "55-3016.00" ~ "Infantry",
  end_onet55 == "55-1016.00" ~ "Infantry Officers",
  end_onet55 == "55-3019.00" ~ "Military Enlisted Tactical Operations and Air/Weapons Specialists and Crew Members, All Other",
  end_onet55 == "55-1019.00" ~ "Military Officer Special and Tactical Operations Leaders, All Other",
  end_onet55 == "55-3018.00" ~ "Special Forces",
  end_onet55 == "55-1017.00" ~ "Special Forces Officers",
), years_in_military_bin = factor(years_in_military_bin, levels = c("1 - 2", "3 - 4", "5 - 10", "11 - 20", "21 - 30", "31+")), 
          year_military_enlist_bin = factor(year_military_enlist_bin, levels = c("Vietnam era (1964 to 1974)", "1975 to 1989", "1990 to 2000 (including Persian Gulf War)", "2001 or later")),
          career_before_military_bin = factor(career_before_military_bin, levels = c("0", "1 - 4", "5 - 10", "11 - 20", "21 - 30", "31+")),
          year_entered_workforce_bin = factor(year_entered_workforce_bin, levels = c("Silent and Greatest", "Baby Boom", "Generation X", "Millennial")),
highest_degree = factor(highest_degree, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")), 
highest_degree_from_forprofit = factor(highest_degree_from_forprofit, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")),
highest_degree_from_hbcu = factor(highest_degree_from_hbcu, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd")))
cluster_membership_visualization(highest_degree)

cluster_membership_visualization(gender)

cluster_membership_visualization(irr_bin)

cluster_membership_visualization(highest_degree_from_forprofit)

cluster_membership_visualization(highest_degree_from_hbcu)

cluster_membership_visualization(year_entered_workforce_bin)

cluster_membership_visualization(officer)

cluster_membership_visualization(end_onet55_label)

cluster_membership_visualization(years_in_military_bin)

cluster_membership_visualization(year_military_enlist_bin)

cluster_membership_visualization(career_before_military_bin)

cluster_membership_visualization_skillmajor(end_onet55_label)

cluster_membership_visualization_skillmajor(top_skill)

cluster_membership_visualization_skillmajor(top_major)

cluster_membership_visualization_skillmajor(skill)

cluster_membership_visualization_skillmajor(major)

  df <- all_df %>%
  filter(!is.na(top_skill)) %>% distinct(id, cluster, cluster_long, cluster_group, cluster_label, top_skill) %>%
  group_by(top_skill) %>% mutate(variable_total = n()) %>% ungroup() %>% mutate(variable_percent = variable_total/n() * 100) %>%
  group_by(cluster) %>%
  mutate(cluster_total = n()) %>%
  group_by(cluster, top_skill) %>%
  mutate(percent = n()/cluster_total * 100, difference = (percent - variable_percent)/variable_percent) %>%
  distinct(top_skill, cluster_long, cluster_group, cluster_label, percent, variable_percent, difference) %>%
  group_by(top_skill) %>% mutate(sum = sum(percent))

df$cluster <- as.factor(df$cluster)
df <- df %>% filter(!is.na(top_skill))
df <- as.data.frame(df)

df %>% group_by(cluster_long) %>% slice_max(difference, n = 5) %>% select(cluster_long, top_skill, difference)
## # A tibble: 50 x 3
## # Groups:   cluster_long [10]
##    cluster_long                                            top_skill  difference
##    <chr>                                                   <chr>           <dbl>
##  1 education - zone 4 to education to zone 4               design        9.03   
##  2 education - zone 4 to education to zone 4               human         1.51   
##  3 education - zone 4 to education to zone 4               informati…    0.527  
##  4 education - zone 4 to education to zone 4               engineeri…    0.114  
##  5 education - zone 4 to education to zone 4               education     0.00263
##  6 education - zone 4 to education to zone 4 and unemmplo… human         3.88   
##  7 education - zone 4 to education to zone 4 and unemmplo… health        1.44   
##  8 education - zone 4 to education to zone 4 and unemmplo… finance       0.628  
##  9 education - zone 4 to education to zone 4 and unemmplo… informati…    0.553  
## 10 education - zone 4 to education to zone 4 and unemmplo… business     -0.542  
## # … with 40 more rows
df %>% group_by(cluster_long) %>% slice_min(difference, n = 5) %>% select(cluster_long, top_skill, difference)
## # A tibble: 50 x 3
## # Groups:   cluster_long [10]
##    cluster_long                                            top_skill  difference
##    <chr>                                                   <chr>           <dbl>
##  1 education - zone 4 to education to zone 4               finance       -0.443 
##  2 education - zone 4 to education to zone 4               sales         -0.410 
##  3 education - zone 4 to education to zone 4               business      -0.373 
##  4 education - zone 4 to education to zone 4               security      -0.0885
##  5 education - zone 4 to education to zone 4               marketing     -0.0885
##  6 education - zone 4 to education to zone 4 and unemmplo… business      -0.542 
##  7 education - zone 4 to education to zone 4 and unemmplo… informati…     0.553 
##  8 education - zone 4 to education to zone 4 and unemmplo… finance        0.628 
##  9 education - zone 4 to education to zone 4 and unemmplo… health         1.44  
## 10 education - zone 4 to education to zone 4 and unemmplo… human          3.88  
## # … with 40 more rows
  df <- all_df %>%
  filter(!is.na(top_major)) %>% distinct(id, cluster, cluster_long, cluster_group, cluster_label, top_major) %>%
  group_by(top_major) %>% mutate(variable_total = n()) %>% ungroup() %>% mutate(variable_percent = variable_total/n() * 100) %>%
  group_by(cluster) %>%
  mutate(cluster_total = n()) %>%
  group_by(cluster, top_major) %>%
  mutate(percent = n()/cluster_total * 100, difference = (percent - variable_percent)/variable_percent) %>%
  distinct(top_major, cluster_long, cluster_group, cluster_label, percent, variable_percent, difference) %>%
  group_by(top_major) %>% mutate(sum = sum(percent))

df$cluster <- as.factor(df$cluster)
df <- df %>% filter(!is.na(top_major))
df <- as.data.frame(df)

df %>% group_by(cluster_long) %>% slice_max(difference, n = 5) %>% select(cluster_long, top_major, difference)
## # A tibble: 44 x 3
## # Groups:   cluster_long [10]
##    cluster_long                                    top_major          difference
##    <chr>                                           <chr>                   <dbl>
##  1 education - zone 4 to education to zone 4       physics                 4.69 
##  2 education - zone 4 to education to zone 4       philosophy_religi…      4.69 
##  3 education - zone 4 to education to zone 4       culinary_entertai…      4.69 
##  4 education - zone 4 to education to zone 4       health_professions      4.69 
##  5 education - zone 4 to education to zone 4       engineering_tech        1.84 
##  6 education - zone 4 to education to zone 4 and … communication           3.33 
##  7 education - zone 4 to education to zone 4 and … engineering             0.784
##  8 education - zone 4 to education to zone 4 and … social_sciences         0.569
##  9 education - zone 4 to education to zone 4 and … public_administra…      0.517
## 10 education - zone 4 to education to zone 4 and … computer               -0.278
## # … with 34 more rows
df %>% group_by(cluster_long) %>% slice_min(difference, n = 5) %>% select(cluster_long, top_major, difference)
## # A tibble: 44 x 3
## # Groups:   cluster_long [10]
##    cluster_long                                     top_major         difference
##    <chr>                                            <chr>                  <dbl>
##  1 education - zone 4 to education to zone 4        social_sciences      -0.608 
##  2 education - zone 4 to education to zone 4        engineering          -0.498 
##  3 education - zone 4 to education to zone 4        psychology           -0.368 
##  4 education - zone 4 to education to zone 4        language_linguis…    -0.0521
##  5 education - zone 4 to education to zone 4        business              0.185 
##  6 education - zone 4 to education to zone 4 and u… business             -0.684 
##  7 education - zone 4 to education to zone 4 and u… computer             -0.278 
##  8 education - zone 4 to education to zone 4 and u… public_administr…     0.517 
##  9 education - zone 4 to education to zone 4 and u… social_sciences       0.569 
## 10 education - zone 4 to education to zone 4 and u… engineering           0.784 
## # … with 34 more rows
library(RColorBrewer)

  overall <- all_df %>% filter(!is.na(top_skill)) %>% mutate(total = n()) %>% group_by(cluster) %>% mutate(overall_cluster_count = n()) %>% group_by(top_skill) %>% mutate(overall_covariate_percent = round((n()/total * 100), 2)) %>% distinct(top_skill, overall_covariate_percent) %>% mutate(cluster_label = "overall", cluster_group = "overall", overall_cluster_percent = .1) %>% rename(cluster_covariate_percent = overall_covariate_percent)
nb.cols <- length(unique(all_df$top_skill))
mycolors <- colorRampPalette(brewer.pal(8, "Set2"))(nb.cols)
all_df %>% filter(!is.na(top_skill)) %>% mutate(total = n()) %>% group_by(cluster) %>% mutate(overall_cluster_count = n(), overall_cluster_percent = n()/total) %>% group_by(top_skill) %>% mutate(overall_covariate_percent = round((n()/total * 100), 2)) %>% group_by(cluster, top_skill) %>% mutate(cluster_covariate_percent = round((n()/overall_cluster_count * 100), 2)) %>% distinct(cluster, cluster_label, cluster_group, top_skill, cluster_covariate_percent, overall_cluster_percent) %>% group_by(top_skill) %>% left_join(select(overall, overall = cluster_covariate_percent), by = "top_skill") %>% rbind(overall) %>% mutate(alpha = ifelse(!is.na(overall), abs((overall - cluster_covariate_percent)/overall), max(abs((overall - cluster_covariate_percent)/overall)))) %>%
  ggplot(aes(cluster_covariate_percent, cluster_label, fill = factor(top_skill), alpha = factor(alpha))) +
  geom_col(aes(width = overall_cluster_percent*10)) +
  facet_wrap(~ factor(cluster_group, levels = c("overall", "education", "promotion", "no change", "demotion", "unemployment")), ncol = 1, scales="free") + scale_fill_manual(values = mycolors) +
  scale_alpha_discrete(guide = "none")

#cluster_membership_visualization(skill)
#cluster_membership_visualization(major)

top_skill <- all_df %>%
  filter(!is.na(top_skill)) %>%
  group_by(cluster) %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
  group_by(top_skill) %>%
  mutate(total = n()) %>%
  group_by(cluster, top_skill) %>%
  mutate(percent = n()/total * 100, difference = (percent - cluster_percent)/percent) %>%
  distinct(top_skill, cluster_long, cluster_group, cluster_label, percent, cluster_percent, difference) %>%
  group_by(top_skill) %>% mutate(sum = sum(percent))

top_skill$cluster <- as.factor(top_skill$cluster)
top_skill <- top_skill %>% filter(!is.na(top_skill))
top_skill <- as.data.frame(top_skill)

ggplot(top_skill, aes(cluster_label, top_skill, fill= difference)) + 
  geom_tile()+
  scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0, limits = c(-10, 10))+
  theme_classic()+
  labs(title = "Cluster membership by top_skill") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  facet_grid(~cluster_group, switch = "x", scales = "free_x", space = "free_x")

top_major <- all_df %>%
  filter(!is.na(top_major)) %>%
  group_by(cluster) %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
  group_by(top_major) %>%
  mutate(total = n()) %>%
  group_by(cluster, top_major) %>%
  mutate(percent = n()/total * 100, difference = (percent - cluster_percent)/percent) %>%
  distinct(top_major, cluster_long, cluster_group, cluster_label, percent, cluster_percent, difference) %>%
  group_by(top_major) %>% mutate(sum = sum(percent))

top_major$cluster <- as.factor(top_major$cluster)
top_major <- top_major %>% filter(!is.na(top_major))
top_major <- as.data.frame(top_major)

ggplot(top_major, aes(cluster_label, top_major, fill= difference)) + 
  geom_tile()+
  scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0, limits = c(-10, 10))+
 theme_classic()+
  labs(title = "Cluster membership by top_major") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  facet_grid(~cluster_group, switch = "x", scales = "free_x", space = "free_x")

#install.packages("ggpubr", dependencies = TRUE)
#ggballoonplot(aes(x = all_df$cluster, y = all_df%top_skill))

#install.packages("FactoMineR", dependencies = TRUE)
#install.packages("factoextra", dependencies = TRUE)
#library(FactoMineR)
library(ca)
test <- all_df %>% select(cluster_label, cluster_group, top_skill) %>% mutate(value = 1)
table <- dcast(test, cluster_label ~ top_skill, fun.aggregate = sum)
table <- table %>% remove_rownames %>% column_to_rownames(var = "cluster_label")
#as.table(table)
plot(ca(table))

test <- all_df %>% select(cluster_label, cluster_group, top_major) %>% mutate(value = 1)
table <- dcast(test, cluster_label ~ top_major, fun.aggregate = sum)
table <- table %>% remove_rownames %>% column_to_rownames(var = "cluster_label")
#as.table(table)
plot(ca(table))


all_df %>% select(cluster_label, cluster_group, top_skill) %>% mutate(count = 1, total = n()) %>% group_by(top_skill) %>% mutate(skill_total = sum(count)) %>% ungroup() %>% group_by(cluster_label) %>% mutate(cluster_total = sum(count)) %>% group_by(cluster_label, top_skill) %>% mutate(value = sum(count)/cluster_total) %>%
ggplot(aes(x = cluster_label, y = top_skill)) +
  geom_point(aes(size=value),shape=21, colour="black", fill="skyblue")+
  theme(panel.background=element_blank(), panel.border = element_rect(colour = "blue", fill=NA, size=1))+
  scale_size_area(max_size=20)+
  facet_grid(~cluster_group, switch = "x", scales = "free_x", space = "free_x") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

cluster_membership_visualization_skillmajor(skill)


all_df %>% filter(!is.na(highest_degree), highest_degree != "post-bacc") %>% group_by(cluster) %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
  group_by(highest_degree) %>%
  mutate(total = n()) %>%
  group_by(cluster, highest_degree) %>%
  mutate(percent = n()/total * 100, difference = percent - cluster_percent) %>%
  distinct(highest_degree, cluster, percent, cluster_percent, difference) %>%
  group_by(highest_degree) %>% mutate(sum = sum(percent)) %>%
ggplot(aes(cluster, highest_degree, fill= difference)) + 
  geom_tile()+
  scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)+
  theme_classic()+
  labs(title = "Cluster membership by highest degree")
all_df %>% group_by(cluster) %>% filter(!is.na(gender)) %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
  group_by(gender) %>%
  mutate(total = n()) %>%
  group_by(cluster, gender) %>%
  mutate(percent = n()/total * 100, difference = percent - cluster_percent) %>%
  distinct(gender, cluster, percent, cluster_percent, difference) %>%
  group_by(gender) %>% mutate(sum = sum(percent)) %>%
ggplot(aes(cluster, gender, fill= difference)) + 
  geom_tile()+
  scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)+
  theme_classic()+
  labs(title = "Cluster membership by gender")
all_df %>% group_by(cluster) %>% filter(highest_degree_from_forprofit != "highschool") %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
  group_by(highest_degree_from_forprofit) %>%
  mutate(total = n()) %>%
  group_by(cluster, highest_degree_from_forprofit) %>%
  mutate(percent = n()/total * 100, difference = percent - cluster_percent) %>%
  distinct(highest_degree_from_forprofit, cluster, percent, cluster_percent, difference) %>%
  group_by(highest_degree_from_forprofit) %>% mutate(sum = sum(percent)) %>%
ggplot(aes(cluster, highest_degree_from_forprofit, fill= difference)) + 
  geom_tile()+
  scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)+
  theme_classic()+
  labs(title = "Cluster membership by highest degree from forprofit")
all_df %>% group_by(cluster) %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
  group_by(highest_degree_from_hbcu) %>%
  mutate(total = n()) %>%
  group_by(cluster, highest_degree_from_hbcu) %>%
  mutate(percent = n()/total * 100, difference = percent - cluster_percent) %>%
  distinct(highest_degree_from_hbcu, cluster, percent, cluster_percent, difference) %>%
  group_by(highest_degree_from_hbcu) %>% mutate(sum = sum(percent)) %>%
ggplot(aes(cluster, highest_degree_from_hbcu, fill= difference)) + 
  geom_tile()+
  scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)+
  theme_classic()+
  labs(title = "Cluster membership by highest degree from hbcu")
all_df %>% group_by(cluster) %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
  group_by(irr_bin) %>%
  mutate(total = n()) %>%
  group_by(cluster, irr_bin) %>%
  mutate(percent = n()/total * 100, difference = percent - cluster_percent) %>%
  distinct(irr_bin, cluster, percent, cluster_percent, difference) %>%
  group_by(irr_bin) %>% mutate(sum = sum(percent)) %>%
ggplot(aes(cluster, irr_bin, fill= difference)) + 
  geom_tile()+
  scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)+
  theme_classic()+
  labs(title = "Cluster membership by IRR")
all_df %>% group_by(cluster) %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
  group_by(officer) %>%
  mutate(total = n()) %>%
  group_by(cluster, officer) %>%
  mutate(percent = n()/total * 100, difference = percent - cluster_percent) %>%
  distinct(officer, cluster, percent, cluster_percent, difference) %>%
  group_by(officer) %>% mutate(sum = sum(percent)) %>%
ggplot(aes(cluster, officer, fill= difference)) + 
  geom_tile()+
  scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)+
  theme_classic()+
  labs(title = "Cluster membership by officer")
all_df %>% group_by(cluster) %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
  group_by(end_onet55) %>%
  mutate(total = n()) %>% filter(total > 10) %>%
  group_by(cluster, end_onet55) %>%
  mutate(percent = n()/total * 100, difference = percent - cluster_percent) %>%
  distinct(end_onet55, cluster, percent, cluster_percent, difference) %>%
  group_by(end_onet55) %>% mutate(sum = sum(percent)) %>%
ggplot(aes(cluster, end_onet55, fill= difference)) + 
  geom_tile()+
  scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)+
  theme_classic()+
  labs(title = "Cluster membership by ending onet 55")
all_df %>% group_by(cluster) %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
  group_by(year_military_enlist_bin) %>%
  mutate(total = n()) %>%
  group_by(cluster, year_military_enlist_bin) %>%
  mutate(percent = n()/total * 100, difference = percent - cluster_percent) %>%
  distinct(cluster, year_military_enlist_bin, percent, cluster_percent, difference) %>%
  group_by(year_military_enlist_bin) %>% mutate(sum = sum(percent)) %>%
ggplot(aes(cluster, year_military_enlist_bin, fill= difference)) + 
  geom_tile()+
  scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)+
  theme_classic()+
  labs(title = "Cluster membership by year military enlist")
all_df %>% group_by(cluster) %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
  group_by(year_entered_workforce_bin) %>%
  mutate(total = n()) %>%
  group_by(cluster, year_entered_workforce_bin) %>%
  mutate(percent = n()/total * 100, difference = percent - cluster_percent) %>%
  distinct(cluster, year_entered_workforce_bin, percent, cluster_percent, difference) %>%
  group_by(year_entered_workforce_bin) %>% mutate(sum = sum(percent)) %>%
ggplot(aes(cluster, year_entered_workforce_bin, fill= difference)) + 
  geom_tile()+
  scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)+
  theme_classic()+
  labs(title = "Cluster membership by year entered the workforce")

# Visualizing 10 years 20 cluster solution
seq.20cluster_20$stats

cluster_20 <- seq.20cluster_20$clustering
cluster_20_fac <- factor(cluster_20)
all <- rownames(seq.10)
all_df <- as.data.frame(cbind(all, cluster_20))
colnames(all_df) <- c("id", "cluster")
covariates <- bg_covariates
covariates$id <- as.character(covariates$id)

all_df <- left_join(all_df, covariates, by = "id")

veteran <- all_df %>%
  filter(!is.na(veteran)) %>%
  group_by(cluster) %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
  group_by(veteran) %>%
  mutate(total = n()) %>%
  group_by(cluster, veteran) %>%
  mutate(percent = n()/total * 100, difference = percent - cluster_percent) %>%
  distinct(veteran, cluster, percent, cluster_percent, difference) %>%
  group_by(veteran) %>% mutate(sum = sum(percent))

veteran$cluster <- as.factor(veteran$cluster)
veteran <- veteran %>% filter(!is.na(veteran))
veteran <- as.data.frame(veteran)

seqdplot(seq.20, group = cluster_20_fac, border = NA, use.layout = TRUE, cols = 5, withlegend = F, sortv = "from.start",
         cpal = c(uva_color_palette[1], uva_color_palette[2],
                  uva_color_palette[4], uva_color_palette[6], uva_color_palette[9],
                  uva_color_palette[5], "#CBBEB5", "dark gray"))
ggplot(veteran, aes(cluster, veteran, fill= difference)) + 
  geom_tile()+
  scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)+
  theme_classic()+
  labs(title = "Clustering membership by veteran")

# Visualizing 30 years 20 cluster solution
seq.30cluster_5$stats

cluster_5 <- seq.30cluster_5$clustering
cluster_5_fac <- factor(cluster_5)
all <- rownames(seq.30)
all_df <- as.data.frame(cbind(all, cluster_5))
colnames(all_df) <- c("id", "cluster")
covariates <- bg_covariates
covariates$id <- as.character(covariates$id)

all_df <- left_join(all_df, covariates, by = "id")

veteran <- all_df %>%
  filter(!is.na(veteran)) %>%
  group_by(cluster) %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
  group_by(veteran) %>%
  mutate(total = n()) %>%
  group_by(cluster, veteran) %>%
  mutate(percent = n()/total * 100, difference = percent - cluster_percent) %>%
  distinct(veteran, cluster, percent, cluster_percent, difference) %>%
  group_by(veteran) %>% mutate(sum = sum(percent))

veteran$cluster <- as.factor(veteran$cluster)
veteran <- veteran %>% filter(!is.na(veteran))
veteran <- as.data.frame(veteran)

seqdplot(seq.30, group = cluster_5_fac, border = NA, use.layout = TRUE, cols = 5, withlegend = F, sortv = "from.start",
         cpal = c(uva_color_palette[1], uva_color_palette[2],
                  uva_color_palette[4], uva_color_palette[6], uva_color_palette[9],
                  uva_color_palette[5], "#CBBEB5", "dark gray"))
ggplot(veteran, aes(cluster, veteran, fill= difference)) + 
  geom_tile()+
  scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)+
  theme_classic()+
  labs(title = "Clustering membership by veteran")
all_df %>% filter(!is.na(highest_degree), highest_degree != "post-bacc") %>% group_by(cluster) %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
  group_by(highest_degree) %>%
  mutate(total = n()) %>%
  group_by(cluster, highest_degree) %>%
  mutate(percent = n()/total * 100, difference = percent - cluster_percent) %>%
  distinct(highest_degree, cluster, percent, cluster_percent, difference) %>%
  group_by(highest_degree) %>% mutate(sum = sum(percent)) %>%
ggplot(aes(cluster, highest_degree, fill= difference)) + 
  geom_tile()+
  scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)+
  theme_classic()+
  labs(title = "Cluster membership by highest degree")
all_df %>% group_by(cluster) %>% filter(!is.na(gender)) %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
  group_by(gender) %>%
  mutate(total = n()) %>%
  group_by(cluster, gender) %>%
  mutate(percent = n()/total * 100, difference = percent - cluster_percent) %>%
  distinct(gender, cluster, percent, cluster_percent, difference) %>%
  group_by(gender) %>% mutate(sum = sum(percent)) %>%
ggplot(aes(cluster, gender, fill= difference)) + 
  geom_tile()+
  scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)+
  theme_classic()+
  labs(title = "Cluster membership by gender")
all_df %>% group_by(cluster) %>% filter(highest_degree_from_forprofit != "highschool") %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
  group_by(highest_degree_from_forprofit) %>%
  mutate(total = n()) %>%
  group_by(cluster, highest_degree_from_forprofit) %>%
  mutate(percent = n()/total * 100, difference = percent - cluster_percent) %>%
  distinct(highest_degree_from_forprofit, cluster, percent, cluster_percent, difference) %>%
  group_by(highest_degree_from_forprofit) %>% mutate(sum = sum(percent)) %>%
ggplot(aes(cluster, highest_degree_from_forprofit, fill= difference)) + 
  geom_tile()+
  scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)+
  theme_classic()+
  labs(title = "Cluster membership by highest degree from forprofit")
all_df %>% group_by(cluster) %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
  group_by(highest_degree_from_hbcu) %>%
  mutate(total = n()) %>%
  group_by(cluster, highest_degree_from_hbcu) %>%
  mutate(percent = n()/total * 100, difference = percent - cluster_percent) %>%
  distinct(highest_degree_from_hbcu, cluster, percent, cluster_percent, difference) %>%
  group_by(highest_degree_from_hbcu) %>% mutate(sum = sum(percent)) %>%
ggplot(aes(cluster, highest_degree_from_hbcu, fill= difference)) + 
  geom_tile()+
  scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)+
  theme_classic()+
  labs(title = "Cluster membership by highest degree from hbcu")
all_df %>% group_by(cluster) %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
  group_by(irr_bin) %>%
  mutate(total = n()) %>%
  group_by(cluster, irr_bin) %>%
  mutate(percent = n()/total * 100, difference = percent - cluster_percent) %>%
  distinct(irr_bin, cluster, percent, cluster_percent, difference) %>%
  group_by(irr_bin) %>% mutate(sum = sum(percent)) %>%
ggplot(aes(cluster, irr_bin, fill= difference)) + 
  geom_tile()+
  scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)+
  theme_classic()+
  labs(title = "Cluster membership by IRR")
all_df %>% group_by(cluster) %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
  group_by(officer) %>%
  mutate(total = n()) %>%
  group_by(cluster, officer) %>%
  mutate(percent = n()/total * 100, difference = percent - cluster_percent) %>%
  distinct(officer, cluster, percent, cluster_percent, difference) %>%
  group_by(officer) %>% mutate(sum = sum(percent)) %>%
ggplot(aes(cluster, officer, fill= difference)) + 
  geom_tile()+
  scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)+
  theme_classic()+
  labs(title = "Cluster membership by officer")
all_df %>% group_by(cluster) %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
  group_by(end_onet55) %>%
  mutate(total = n()) %>% filter(total > 10) %>%
  group_by(cluster, end_onet55) %>%
  mutate(percent = n()/total * 100, difference = percent - cluster_percent) %>%
  distinct(end_onet55, cluster, percent, cluster_percent, difference) %>%
  group_by(end_onet55) %>% mutate(sum = sum(percent)) %>%
ggplot(aes(cluster, end_onet55, fill= difference)) + 
  geom_tile()+
  scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)+
  theme_classic()+
  labs(title = "Cluster membership by ending onet 55")
all_df %>% group_by(cluster) %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
  group_by(year_military_enlist_bin) %>%
  mutate(total = n()) %>%
  group_by(cluster, year_military_enlist_bin) %>%
  mutate(percent = n()/total * 100, difference = percent - cluster_percent) %>%
  distinct(cluster, year_military_enlist_bin, percent, cluster_percent, difference) %>%
  group_by(year_military_enlist_bin) %>% mutate(sum = sum(percent)) %>%
ggplot(aes(cluster, year_military_enlist_bin, fill= difference)) + 
  geom_tile()+
  scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)+
  theme_classic()+
  labs(title = "Cluster membership by year military enlist")
all_df %>% group_by(cluster) %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
  group_by(year_entered_workforce_bin) %>%
  mutate(total = n()) %>%
  group_by(cluster, year_entered_workforce_bin) %>%
  mutate(percent = n()/total * 100, difference = percent - cluster_percent) %>%
  distinct(cluster, year_entered_workforce_bin, percent, cluster_percent, difference) %>%
  group_by(year_entered_workforce_bin) %>% mutate(sum = sum(percent)) %>%
ggplot(aes(cluster, year_entered_workforce_bin, fill= difference)) + 
  geom_tile()+
  scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)+
  theme_classic()+
  labs(title = "Cluster membership by year entered the workforce")

# Visualizing 20 years 20 cluster solution
seq.20cluster_20$stats

cluster_20 <- seq.20cluster_20$clustering
cluster_20_fac <- factor(cluster_20)
all <- rownames(seq.20)
all_df <- as.data.frame(cbind(all, cluster_20))
colnames(all_df) <- c("id", "cluster")
covariates <- bg_covariates
covariates$id <- as.character(covariates$id)

all_df <- left_join(all_df, covariates, by = "id")
all_df <- all_df %>% left_join(highest_degree_from_forprofit, by = "id")
all_df <- all_df %>% left_join(highest_degree_from_hbcu, by = "id")
all_df <- all_df %>% mutate(irr_bin = ntile(irr, 5))

# Plotting
seqdplot(seq.20, group = cluster_20_fac, border = NA, use.layout = TRUE, cols = 5, withlegend = F, sortv = "from.start",
         cpal = c(uva_color_palette[1], uva_color_palette[2],
                  uva_color_palette[4], uva_color_palette[6], uva_color_palette[9],
                  uva_color_palette[5], "#CBBEB5", "dark gray"))

all_df %>% filter(!is.na(highest_degree), highest_degree != "post-bacc") %>% group_by(cluster) %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
  group_by(highest_degree) %>%
  mutate(total = n()) %>%
  group_by(cluster, highest_degree) %>%
  mutate(percent = n()/total * 100, difference = percent - cluster_percent) %>%
  distinct(highest_degree, cluster, percent, cluster_percent, difference) %>%
  group_by(highest_degree) %>% mutate(sum = sum(percent)) %>%
ggplot(aes(cluster, highest_degree, fill= difference)) + 
  geom_tile()+
  scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)+
  theme_classic()+
  labs(title = "Cluster membership by highest degree")
all_df %>% group_by(cluster) %>% filter(!is.na(gender)) %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
  group_by(gender) %>%
  mutate(total = n()) %>%
  group_by(cluster, gender) %>%
  mutate(percent = n()/total * 100, difference = percent - cluster_percent) %>%
  distinct(gender, cluster, percent, cluster_percent, difference) %>%
  group_by(gender) %>% mutate(sum = sum(percent)) %>%
ggplot(aes(cluster, gender, fill= difference)) + 
  geom_tile()+
  scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)+
  theme_classic()+
  labs(title = "Cluster membership by gender")
all_df %>% group_by(cluster) %>% filter(highest_degree_from_forprofit != "highschool") %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
  group_by(highest_degree_from_forprofit) %>%
  mutate(total = n()) %>%
  group_by(cluster, highest_degree_from_forprofit) %>%
  mutate(percent = n()/total * 100, difference = percent - cluster_percent) %>%
  distinct(highest_degree_from_forprofit, cluster, percent, cluster_percent, difference) %>%
  group_by(highest_degree_from_forprofit) %>% mutate(sum = sum(percent)) %>%
ggplot(aes(cluster, highest_degree_from_forprofit, fill= difference)) + 
  geom_tile()+
  scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)+
  theme_classic()+
  labs(title = "Cluster membership by highest degree from forprofit")
all_df %>% group_by(cluster) %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
  group_by(highest_degree_from_hbcu) %>%
  mutate(total = n()) %>%
  group_by(cluster, highest_degree_from_hbcu) %>%
  mutate(percent = n()/total * 100, difference = percent - cluster_percent) %>%
  distinct(highest_degree_from_hbcu, cluster, percent, cluster_percent, difference) %>%
  group_by(highest_degree_from_hbcu) %>% mutate(sum = sum(percent)) %>%
ggplot(aes(cluster, highest_degree_from_hbcu, fill= difference)) + 
  geom_tile()+
  scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)+
  theme_classic()+
  labs(title = "Cluster membership by highest degree from hbcu")
all_df %>% group_by(cluster) %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
  group_by(irr_bin) %>%
  mutate(total = n()) %>%
  group_by(cluster, irr_bin) %>%
  mutate(percent = n()/total * 100, difference = percent - cluster_percent) %>%
  distinct(irr_bin, cluster, percent, cluster_percent, difference) %>%
  group_by(irr_bin) %>% mutate(sum = sum(percent)) %>%
ggplot(aes(cluster, irr_bin, fill= difference)) + 
  geom_tile()+
  scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)+
  theme_classic()+
  labs(title = "Cluster membership by IRR")
all_df %>% group_by(cluster) %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
  group_by(officer) %>%
  mutate(total = n()) %>%
  group_by(cluster, officer) %>%
  mutate(percent = n()/total * 100, difference = percent - cluster_percent) %>%
  distinct(officer, cluster, percent, cluster_percent, difference) %>%
  group_by(officer) %>% mutate(sum = sum(percent)) %>%
ggplot(aes(cluster, officer, fill= difference)) + 
  geom_tile()+
  scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)+
  theme_classic()+
  labs(title = "Cluster membership by officer")
all_df %>% group_by(cluster) %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
  group_by(end_onet55) %>%
  mutate(total = n()) %>% filter(total > 10) %>%
  group_by(cluster, end_onet55) %>%
  mutate(percent = n()/total * 100, difference = percent - cluster_percent) %>%
  distinct(end_onet55, cluster, percent, cluster_percent, difference) %>%
  group_by(end_onet55) %>% mutate(sum = sum(percent)) %>%
ggplot(aes(cluster, end_onet55, fill= difference)) + 
  geom_tile()+
  scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)+
  theme_classic()+
  labs(title = "Cluster membership by ending onet 55")
all_df %>% group_by(cluster) %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
  group_by(year_military_enlist_bin) %>%
  mutate(total = n()) %>%
  group_by(cluster, year_military_enlist_bin) %>%
  mutate(percent = n()/total * 100, difference = percent - cluster_percent) %>%
  distinct(cluster, year_military_enlist_bin, percent, cluster_percent, difference) %>%
  group_by(year_military_enlist_bin) %>% mutate(sum = sum(percent)) %>%
ggplot(aes(cluster, year_military_enlist_bin, fill= difference)) + 
  geom_tile()+
  scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)+
  theme_classic()+
  labs(title = "Cluster membership by year military enlist")
all_df %>% group_by(cluster) %>% mutate(cluster_total = n()) %>% ungroup() %>% mutate(cluster_percent = cluster_total/n() * 100) %>%
  group_by(year_entered_workforce_bin) %>%
  mutate(total = n()) %>%
  group_by(cluster, year_entered_workforce_bin) %>%
  mutate(percent = n()/total * 100, difference = percent - cluster_percent) %>%
  distinct(cluster, year_entered_workforce_bin, percent, cluster_percent, difference) %>%
  group_by(year_entered_workforce_bin) %>% mutate(sum = sum(percent)) %>%
ggplot(aes(cluster, year_entered_workforce_bin, fill= difference)) + 
  geom_tile()+
  scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)+
  theme_classic()+
  labs(title = "Cluster membership by year entered the workforce")
vet.seq <- create_seq_vet_fixed("vet.seq", 10, left_unemp = TRUE)

transition_matrix <- seqtrate(vet.seq, weighted=FALSE, count=TRUE)
round(transition_matrix,2)
diag(transition_matrix) = 0
round(transition_matrix,2)
rowsum <- apply(transition_matrix, 1, sum)
for(i in 1:nrow(transition_matrix)) {
  for (j in 1:ncol(transition_matrix)){
    transition_matrix[i,j] = transition_matrix[i,j]/rowsum[i]
  }
}
for(j in 1:ncol(transition_matrix)){
      transition_matrix[7,j] = transition_matrix[7,j] = 0
}
transition_matrix <- round(transition_matrix,2)
cost_matrix <- 2-transition_matrix
round(cost_matrix,2)
diag(cost_matrix) = 0
cost_matrix <- round(cost_matrix,2)

# OPTION 1 : OUR OM SPELL, LOW EXPANSION COST AND LOW WEIGHT
sequencing.opt1 <- seqdist(vet.seq, method = "OMspell", indel = "auto", sm = cost_matrix, with.missing = TRUE, otto = 0, expcost = 0)

# OPTION 2 : OUR OM
sequencing.opt2 <- seqdist(vet.seq, method = "OMstran", indel = "auto", sm = cost_matrix, with.missing = TRUE, otto = 0.5)

# OPTION 3 : OUR OM, LOW WEIGHT ON ORIGIN
sequencing.opt3 <- seqdist(vet.seq, method = "OMstran", indel = 1, sm = cost_matrix, with.missing = TRUE, otto = 0.01)

# CLUSTERING
numbers <- (2:20)
pamstats <- NULL
#rownames(pamstats) <- numbers
pamstats <- NULL
for (number in numbers) {
  clusterpam <- wcKMedoids(sequencing.opt1, k = number)
  assign(paste0("sequencing.pam.opt1.", number), clusterpam, envir = .GlobalEnv)
  pamstats[[number]] <- clusterpam$stats
}
pamstats.opt1 <- dplyr::bind_rows(pamstats)
pamstats <- NULL
for (number in numbers) {
  clusterpam <- wcKMedoids(sequencing.opt2, k = number)
  assign(paste0("sequencing.pam.opt2.", number), clusterpam, envir = .GlobalEnv)
  pamstats[[number]] <- clusterpam$stats
}
pamstats.opt2 <- dplyr::bind_rows(pamstats)
for (number in numbers) {
  clusterpam <- wcKMedoids(sequencing.opt3, k = number)
  assign(paste0("sequencing.pam.opt3.", number), clusterpam, envir = .GlobalEnv)
  pamstats[[number]] <- clusterpam$stats
}
pamstats.opt3 <- dplyr::bind_rows(pamstats)

# QUALITY METRICS
rownames(pamstats.opt1) <- numbers
PAM.ASW.Opt1 <- as.matrix(pamstats.opt1$ASW)
colnames(PAM.ASW.Opt1) <- c("ASW opt1")
PAM.ASW.Opt2 <- as.matrix(pamstats.opt2$ASW)
colnames(PAM.ASW.Opt2) <- c("ASW opt2")
PAM.ASW.Opt3 <- as.matrix(pamstats.opt3$ASW)
colnames(PAM.ASW.Opt3) <- c("ASW opt3")
PAM.ASW <- bind_cols(PAM.ASW.Opt1, PAM.ASW.Opt2, PAM.ASW.Opt3)
PAM.ASW <- PAM.ASW %>% rownames_to_column() %>% melt() %>% mutate(group = "pam")
PAM.ASW$rowname <- as.numeric(PAM.ASW$rowname)

rownames(pamstats.opt1) <- numbers
PAM.HC.Opt1 <- as.matrix(pamstats.opt1$HC)
colnames(PAM.HC.Opt1) <- c("HC opt1")
PAM.HC.Opt2 <- as.matrix(pamstats.opt2$HC)
colnames(PAM.HC.Opt2) <- c("HC opt2")
PAM.HC.Opt3 <- as.matrix(pamstats.opt3$HC)
colnames(PAM.HC.Opt3) <- c("HC opt3")
PAM.HC <- bind_cols(PAM.HC.Opt1, PAM.HC.Opt2, PAM.HC.Opt3)
PAM.HC <- PAM.HC %>% rownames_to_column() %>% melt() %>% mutate(group = "pam")
PAM.HC$rowname <- as.numeric(PAM.HC$rowname)

rownames(pamstats.opt1) <- numbers
PAM.PBC.Opt1 <- as.matrix(pamstats.opt1$PBC)
colnames(PAM.PBC.Opt1) <- c("PBC opt1")
PAM.PBC.Opt2 <- as.matrix(pamstats.opt2$PBC)
colnames(PAM.PBC.Opt2) <- c("PBC opt2")
PAM.PBC.Opt3 <- as.matrix(pamstats.opt3$PBC)
colnames(PAM.PBC.Opt3) <- c("PBC opt3")
PAM.PBC <- bind_cols(PAM.PBC.Opt1, PAM.PBC.Opt2, PAM.PBC.Opt3)
PAM.PBC <- PAM.PBC %>% rownames_to_column() %>% melt() %>% mutate(group = "pam")
PAM.PBC$rowname <- as.numeric(PAM.PBC$rowname)

rownames(pamstats.opt1) <- numbers
PAM.R2.Opt1 <- as.matrix(pamstats.opt1$R2)
colnames(PAM.R2.Opt1) <- c("R2 opt1")
PAM.R2.Opt2 <- as.matrix(pamstats.opt2$R2)
colnames(PAM.R2.Opt2) <- c("R2 opt2")
PAM.R2.Opt3 <- as.matrix(pamstats.opt3$R2)
colnames(PAM.R2.Opt3) <- c("R2 opt3")
PAM.R2 <- bind_cols(PAM.R2.Opt1, PAM.R2.Opt2, PAM.R2.Opt3)
PAM.R2 <- PAM.R2 %>% rownames_to_column() %>% melt() %>% mutate(group = "pam")
PAM.R2$rowname <- as.numeric(PAM.R2$rowname)

PAM.ASW %>% mutate(clusters = rowname + 1) %>%
ggplot(aes(x = variable, y = clusters, fill = value)) +
  #facet_grid(. ~ group, scales = "free") +
  geom_tile() +
  scale_fill_continuous(name = "ASW value", type = "viridis") +
  labs(title = "Sequencing - ASW (optimal 1)", caption = "Average silhouette width (ASW) is the coherence of assignments, high coherence indicates high between-group distances and strong
within-group homogeneity \n Dissimiarlity options:\n Option 1: Optimal matching of spell sequences, low expansion cost, low weight, using our custom cost matrix \n Option 2: Optimal matching of transition sequences using our custom cost matrix \n Option 3: Optimal matching of transition sequences, low weight, using our custom cost matrix") +
  scale_x_discrete(name = "dissimilarity option", labels = c("1", "2", "3")) +
  ylab("number of clusters") +
  ylim(1,21) +
  theme_bw()

PAM.HC %>% mutate(clusters = rowname + 1) %>%
ggplot(aes(x = variable, y = clusters, fill = value)) +
  #facet_grid(. ~ group, scales = "free") +
  geom_tile() +
  scale_fill_continuous(name = "ASW value", type = "viridis") +
  labs(title = "Sequencing - ASW (optimal 1)", caption = "Average silhouette width (ASW) is the coherence of assignments, high coherence indicates high between-group distances and strong
within-group homogeneity \n Dissimiarlity options:\n Option 1: Optimal matching of spell sequences, low expansion cost, low weight, using our custom cost matrix \n Option 2: Optimal matching of transition sequences using our custom cost matrix \n Option 3: Optimal matching of transition sequences, low weight, using our custom cost matrix") +
  scale_x_discrete(name = "dissimilarity option", labels = c("1", "2", "3")) +
  ylab("number of clusters") +
  ylim(1,21) +
  theme_bw()

PAM.PBC %>% mutate(clusters = rowname + 1) %>%
ggplot(aes(x = variable, y = clusters, fill = value)) +
  #facet_grid(. ~ group, scales = "free") +
  geom_tile() +
  scale_fill_continuous(name = "ASW value", type = "viridis") +
  labs(title = "Sequencing - ASW (optimal 1)", caption = "Average silhouette width (ASW) is the coherence of assignments, high coherence indicates high between-group distances and strong
within-group homogeneity \n Dissimiarlity options:\n Option 1: Optimal matching of spell sequences, low expansion cost, low weight, using our custom cost matrix \n Option 2: Optimal matching of transition sequences using our custom cost matrix \n Option 3: Optimal matching of transition sequences, low weight, using our custom cost matrix") +
  scale_x_discrete(name = "dissimilarity option", labels = c("1", "2", "3")) +
  ylab("number of clusters") +
  ylim(1,21) +
  theme_bw()

PAM.R2 %>% mutate(clusters = rowname + 1) %>%
ggplot(aes(x = variable, y = clusters, fill = value)) +
  #facet_grid(. ~ group, scales = "free") +
  geom_tile() +
  scale_fill_continuous(name = "ASW value", type = "viridis") +
  labs(title = "Sequencing - ASW (optimal 1)", caption = "Average silhouette width (ASW) is the coherence of assignments, high coherence indicates high between-group distances and strong
within-group homogeneity \n Dissimiarlity options:\n Option 1: Optimal matching of spell sequences, low expansion cost, low weight, using our custom cost matrix \n Option 2: Optimal matching of transition sequences using our custom cost matrix \n Option 3: Optimal matching of transition sequences, low weight, using our custom cost matrix") +
  scale_x_discrete(name = "dissimilarity option", labels = c("1", "2", "3")) +
  ylab("number of clusters") +
  ylim(1,21) +
  theme_bw()

clusterpam <- wcKMedoids(sequencing.opt2, k = 20)
clusterpam$stats

clusterpam_20 <- clusterpam$clustering
clusterpam_20_fac <- factor(clusterpam_20)#, #labels = paste("Type", 1:20))

seqdplot(vet.seq, group = clusterpam_20_fac, border = NA, use.layout = TRUE, cols = 5, withlegend = F, sortv = "from.start",
         cpal = c(uva_color_palette[1], uva_color_palette[2],
                  uva_color_palette[4], uva_color_palette[6], uva_color_palette[9],
                  uva_color_palette[5], "#CBBEB5", "dark gray"))
seqlegend(vet.seq, cpal = c(uva_color_palette[1], uva_color_palette[2],
                  uva_color_palette[4], uva_color_palette[6], uva_color_palette[9],
                  uva_color_palette[5], "#CBBEB5", "dark gray"))

vet <- rownames(vet.seq)

vet_df <- as.data.frame(cbind(vet, clusterpam_20))
colnames(vet_df) <- c("id", "cluster")
bg_covariates$id <- as.character(bg_covariates$id)

vet_df <- left_join(vet_df, bg_covariates, by = "id")

degree <- vet_df %>%
  filter(!is.na(highest_degree)) %>%
  group_by(highest_degree) %>%
  mutate(total = n()) %>%
  group_by(cluster, highest_degree) %>%
  mutate(percent = n()/total) %>% distinct(highest_degree, cluster, percent) %>%
  group_by(highest_degree) %>% mutate(sum = sum(percent))

degree$cluster <- as.factor(degree$cluster)
degree$highest_degree <- factor(degree$highest_degree, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd"))
degree <- degree %>% filter(!is.na(highest_degree))

degree <- as.data.frame(degree)

ggplot(degree, aes(cluster, highest_degree, fill= percent)) + 
  geom_tile()+
  scale_fill_gradient(low="white", high="blue")+
  theme_classic()+
  labs(title = "Clustering X Degree --before normalization")

year_entered_workforce_bin <- vet_df %>%
  filter(!is.na(year_entered_workforce_bin)) %>%
  group_by(year_entered_workforce_bin) %>%
  mutate(total = n()) %>%
  group_by(cluster, year_entered_workforce_bin) %>%
  mutate(percent = n()/total) %>% distinct(cluster, year_entered_workforce_bin, percent) %>%
  group_by(year_entered_workforce_bin) %>% mutate(sum = sum(percent))

year_entered_workforce_bin$cluster <- as.factor(year_entered_workforce_bin$cluster)
#irr_bin$highest_degree <- factor(irr_bin$highest_degree, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd"))
#degree <- degree %>% filter(!is.na(highest_degree))

year_entered_workforce_bin <- as.data.frame(year_entered_workforce_bin)

seqdplot(vet.seq, group = clusterpam_20_fac, border = NA, use.layout = TRUE, cols = 5, withlegend = F, sortv = "from.start",
         cpal = c(uva_color_palette[1], uva_color_palette[2],
                  uva_color_palette[4], uva_color_palette[6], uva_color_palette[9],
                  uva_color_palette[5], "#CBBEB5", "dark gray"))

ggplot(year_entered_workforce_bin, aes(cluster, year_entered_workforce_bin, fill= percent)) + 
  geom_tile()+
  scale_fill_gradient(low="white", high="blue")+
  theme_classic()+
  labs(title = "Clustering X Degree --before normalization")


transition_matrix <- seqtrate(seq.20, weighted=FALSE, count=TRUE)
round(transition_matrix,2)
diag(transition_matrix) = 0
round(transition_matrix,2)
rowsum <- apply(transition_matrix, 1, sum)
for(i in 1:nrow(transition_matrix)) {
  for (j in 1:ncol(transition_matrix)){
    transition_matrix[i,j] = transition_matrix[i,j]/rowsum[i]
  }
}
for(j in 1:ncol(transition_matrix)){
      transition_matrix[7,j] = transition_matrix[7,j] = 0
}
transition_matrix <- round(transition_matrix,2)
cost_matrix <- 2-transition_matrix
round(cost_matrix,2)
diag(cost_matrix) = 0
cost_matrix <- round(cost_matrix,2)

sequencing.opt1 <- seqdist(seq.10, method = "OMspell", indel = "auto", sm = cost_matrix, with.missing = TRUE, otto = 0, expcost = 0)

clusterpam <- wcKMedoids(sequencing.opt1, k = 20)
clusterpam$stats

clusterpam_20 <- clusterpam$clustering
clusterpam_20_fac <- factor(clusterpam_20)#, #labels = paste("Type", 1:20))

#seqdplot(seq.20, group = clusterpam_20_fac, border = NA, use.layout = TRUE, cols = 5, withlegend = F, sortv = "from.start",
#         cpal = c(uva_color_palette[1], uva_color_palette[2],
#                  uva_color_palette[4], uva_color_palette[6], uva_color_palette[9],
#                  uva_color_palette[5], "#CBBEB5", "dark gray"))
#seqlegend(seq.10, cpal = c(uva_color_palette[1], uva_color_palette[2],
#                  uva_color_palette[4], uva_color_palette[6], uva_color_palette[9],
#                  uva_color_palette[5], "#CBBEB5", "dark gray"))

all <- rownames(seq.20)

all_df <- as.data.frame(cbind(all, clusterpam_20))
colnames(all_df) <- c("id", "cluster")
bg_covariates$id <- as.character(bg_covariates$id)

all_df <- left_join(all_df, bg_covariates, by = "id")

veteran <- all_df %>%
  filter(!is.na(veteran)) %>%
  group_by(veteran) %>%
  mutate(total = n()) %>%
  group_by(cluster, veteran) %>%
  mutate(percent = n()/total) %>% distinct(veteran, cluster, percent) %>%
  group_by(veteran) %>% mutate(sum = sum(percent))

veteran$cluster <- as.factor(veteran$cluster)
#degree$veteran <- factor(degree$highest_degree, levels = c("highschool", "GED/diploma", "associates", "bachelors", "masters", "phd"))
veteran <- veteran %>% filter(!is.na(veteran))

veteran <- as.data.frame(veteran)

seqdplot(seq.20, group = clusterpam_20_fac, border = NA, use.layout = TRUE, cols = 5, withlegend = F, sortv = "from.start",
         cpal = c(uva_color_palette[1], uva_color_palette[2],
                  uva_color_palette[4], uva_color_palette[6], uva_color_palette[9],
                  uva_color_palette[5], "#CBBEB5", "dark gray"))
ggplot(veteran, aes(cluster, veteran, fill= percent)) + 
  geom_tile()+
  scale_fill_gradient(low="white", high="blue")+
  theme_classic()+
  labs(title = "Clustering X Degree --before normalization")